##Exploratory Data Analysis

##Analysing Station_GeoLocation_Longitute_Latitude_Elevation_EPSG_4326.csv

## Have a look at the data

# First look at the station table: shape, column preview, names, structure
# and per-column summary. Recorded console output is kept after each call.
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
Weather_Station %>% dim()
## [1] 8 4
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
Weather_Station %>% glimpse()
## Rows: 8
## Columns: 4
## $ longitude     <dbl> 73.0167, 80.2500, 77.2000, 80.9330, 72.8500, 77.5833, 85…
## $ Latitude      <dbl> 26.3000, 13.0667, 28.5833, 26.8667, 19.1167, 12.9667, 20…
## $ Elevation     <int> 217, 6, 211, 110, 8, 920, NA, NA
## $ Location_Name <chr> "Bangalore", "Chennai", "Delhi", "Lucknow", "Mumbai", "R…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
Weather_Station %>% names()
## [1] "longitude"     "Latitude"      "Elevation"     "Location_Name"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
Weather_Station %>% str()
## 'data.frame':    8 obs. of  4 variables:
##  $ longitude    : num  73 80.2 77.2 80.9 72.8 ...
##  $ Latitude     : num  26.3 13.1 28.6 26.9 19.1 ...
##  $ Elevation    : int  217 6 211 110 8 920 NA NA
##  $ Location_Name: chr  "Bangalore" "Chennai" "Delhi" "Lucknow" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
Weather_Station %>% summary()
##    longitude        Latitude       Elevation     Location_Name     
##  Min.   :72.85   Min.   :12.97   Min.   :  6.0   Length:8          
##  1st Qu.:76.15   1st Qu.:17.60   1st Qu.: 33.5   Class :character  
##  Median :78.92   Median :21.23   Median :160.5   Mode  :character  
##  Mean   :79.07   Mean   :21.17   Mean   :245.3                     
##  3rd Qu.:81.92   3rd Qu.:26.44   3rd Qu.:215.5                     
##  Max.   :85.83   Max.   :28.58   Max.   :920.0                     
##                                  NA's   :2
# Only Elevation has missing data, so list the affected stations.
# The column is referenced through the data frame itself so the filter cannot
# be shadowed by a global `Elevation` or by another attached object.
# NOTE(review): attach() is kept only in case later code still uses bare
# column names; remove it once those references are qualified.
attach(Weather_Station)
Weather_Station[is.na(Weather_Station$Elevation), ]
##   longitude Latitude Elevation Location_Name
## 7   85.8333  20.2500        NA   Bubhneshwar
## 8   84.8833  22.2167        NA      Rourkela
# Per-variable missing counts, then the overall proportion of missing cells.
Weather_Station %>% miss_var_summary()
## # A tibble: 4 × 3
##   variable      n_miss pct_miss
##   <chr>          <int>    <dbl>
## 1 Elevation          2       25
## 2 longitude          0        0
## 3 Latitude           0        0
## 4 Location_Name      0        0
Weather_Station %>% prop_miss()
## [1] 0.0625
## There is no apparent reason why only Bubhneshwar and Rourkela are missing an elevation value
## No cleaning of this dataset is needed

##Analysing and Performing Imputations on Bangalore_1990_2022_BangaloreCity.csv

## Have a look at the data
# First look at the Bangalore daily weather table: shape, column preview,
# first/last rows, column names, structure and per-column summary.
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bangalore)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bangalore)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bangalore)
##         time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4   NA
## 2 02-01-1990 21.7   NA 26.5    0
## 3 03-01-1990 21.0 16.4 26.5    0
## 4 04-01-1990 20.8   NA 27.4    0
## 5 05-01-1990 20.4 14.2 26.1    0
## 6 06-01-1990 20.4 17.1 24.2   NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bangalore)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8  0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9  0.0
## 11892 23-07-2022 23.1 20.9 26.7  0.0
## 11893 24-07-2022 22.8 20.0 26.7  0.3
## 11894 25-07-2022 24.1 20.2 28.5  0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# Keep the names in `all_columns` and print them too: a bare assignment shows
# nothing, so the message above was previously followed by no output.
all_columns <- names(Weather_Bangalore)
print(all_columns)
## [1] "time" "tavg" "tmin" "tmax" "prcp"

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bangalore)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
##  $ tmin: num  19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
##  $ tmax: num  28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
##  $ prcp: num  NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bangalore)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :17.20   Min.   : 9.30   Min.   :19.80  
##  Class :character   1st Qu.:22.30   1st Qu.:18.10   1st Qu.:27.90  
##  Mode  :character   Median :23.50   Median :19.80   Median :29.50  
##                     Mean   :23.84   Mean   :19.39   Mean   :29.93  
##                     3rd Qu.:25.20   3rd Qu.:20.80   3rd Qu.:32.00  
##                     Max.   :32.40   Max.   :27.90   Max.   :39.20  
##                     NA's   :70      NA's   :1389    NA's   :629    
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  4.414  
##  3rd Qu.:  2.000  
##  Max.   :271.300  
##  NA's   :4620
## Lets analyse the missing data of the dataset
# Total missing cells, per-variable breakdown, then missingness of prcp in
# consecutive spans of 250 rows.
Weather_Bangalore %>% n_miss() ## Total number of missing parameters
## [1] 6708
Weather_Bangalore %>% miss_var_summary() ## Missingness summary
## # A tibble: 5 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 prcp       4620   38.8  
## 2 tmin       1389   11.7  
## 3 tmax        629    5.29 
## 4 tavg         70    0.589
## 5 time          0    0
Weather_Bangalore %>%
  miss_var_span(var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     42        208     0.168         0.832       250
##  2            2     47        203     0.188         0.812       250
##  3            3     50        200     0.2           0.8         250
##  4            4     41        209     0.164         0.836       250
##  5            5     34        216     0.136         0.864       250
##  6            6     31        219     0.124         0.876       250
##  7            7     39        211     0.156         0.844       250
##  8            8     18        232     0.072         0.928       250
##  9            9     46        204     0.184         0.816       250
## 10           10     38        212     0.152         0.848       250
## # ℹ 38 more rows
# Tabulate how many variables share each missing-value count.
Weather_Bangalore %>% miss_var_table()
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1       20
## 2            70      1       20
## 3           629      1       20
## 4          1389      1       20
## 5          4620      1       20
# Visual overview of where values are missing.
vis_miss(Weather_Bangalore) ## visualise % of missing

gg_miss_upset(Weather_Bangalore) ## plot for missing data

# Heat map of missingness per variable, grouped by calendar year.
# The grouping column should be categorical. Grouping by the continuous `prcp`
# split the data by every distinct rainfall amount, and ggplot removed rows
# with missing values (the earlier `geom_tile()` warning) — presumably the
# group where prcp itself is NA, which is the group of interest.
Weather_Bangalore %>%
  mutate(obs_year = year(dmy(time))) %>%
  gg_miss_fct(fct = obs_year) ## Heat map of missingness

gg_miss_span(Weather_Bangalore, var = prcp, span_every = 250) ## Visualize span of prcp missingness

## With this we can clearly see the precipitation data is missing a lot (39%), followed by tmin
miss_scan_count(data = Weather_Bangalore, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
##   Variable     n
##   <chr>    <int>
## 1 time         0
## 2 tavg         0
## 3 tmin         0
## 4 tmax         0
## 5 prcp         0
# Shadow matrix: one `<col>_NA` factor per column marking NA / !NA.
Weather_Bangalore %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 5
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     !NA     !NA     NA     
## 2 !NA     !NA     NA      !NA     !NA    
## 3 !NA     !NA     !NA     !NA     !NA    
## 4 !NA     !NA     NA      !NA     !NA    
## 5 !NA     !NA     !NA     !NA     !NA    
## 6 !NA     !NA     !NA     !NA     NA
# Nabular data: the original columns with the shadow columns bound alongside
# (only for variables that actually have missing values).
Weather_Bangalore %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 9
##   time        tavg  tmin  tmax  prcp tavg_NA tmin_NA tmax_NA prcp_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct>   <fct>   <fct>   <fct>  
## 1 01-01-1990  22.9  19.1  28.4    NA !NA     !NA     !NA     NA     
## 2 02-01-1990  21.7  NA    26.5     0 !NA     NA      !NA     !NA    
## 3 03-01-1990  21    16.4  26.5     0 !NA     !NA     !NA     !NA    
## 4 04-01-1990  20.8  NA    27.4     0 !NA     NA      !NA     !NA    
## 5 05-01-1990  20.4  14.2  26.1     0 !NA     !NA     !NA     !NA    
## 6 06-01-1990  20.4  17.1  24.2    NA !NA     !NA     !NA     NA
# Compare the tavg distribution between days with and without a prcp reading.
# tavg itself contains NAs, so they are dropped inside each group: without
# na.rm = TRUE a single missing tavg turns that group's mean and sd into NA,
# which is what produced the NA row in the recorded output below.
Weather_Bangalore %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added; the `NA` row now holds
## numeric values (re-run to refresh), the `!NA` row is unchanged:
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          23.7    2.17
## 2 NA           NA     NA

# Density of tavg, coloured by whether prcp is missing and split into panels
# by whether tmin is missing.
ggplot(bind_shadow(Weather_Bangalore), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Combining variables of class <shade> and <factor> was deprecated in ggplot2
## 3.4.0.
## ℹ Please ensure your variables are compatible before plotting (location:
##   `join_keys()`)
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 70 rows containing non-finite values (`stat_density()`).

# Precipitation against average temperature, using geom_miss_point() to
# display the missingness; first for all rows, then one panel per year.
Weather_Bangalore %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

Weather_Bangalore %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(dmy(time)))

# From the plots above, the missing data does not look excessive.

# Replace every NA with a value placed 10% below the observed range.
Weather_Bangalore_imp <- Weather_Bangalore %>% impute_below_all()
Weather_Bangalore_imp %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Bind the shadow columns first so the imputed values remain identifiable.
Weather_Bangalore_imp_track <- Weather_Bangalore %>%
  bind_shadow() %>%
  impute_below_all()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `time_NA = (function (x, ...) ...`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## ℹ The deprecated feature was likely used in the naniar package.
##   Please report the issue at <https://github.com/njtierney/naniar/issues>.
Weather_Bangalore_imp_track %>%
  ggplot(aes(x = prcp, fill = prcp_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Bangalore_imp_track %>%
  ggplot(aes(x = tmin, fill = tmin_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Bangalore_imp_track %>%
  ggplot(aes(x = tavg, y = prcp, color = prcp_NA)) +
  geom_point()

## All the NA values have been imputed successfully at this point

# Impute the two most incomplete variables, tmin and prcp, by linear
# regression on the other temperature columns.
# tmin is imputed first because the prcp model uses tmin as a predictor; in
# the previous order prcp could not be predicted on rows where tmin was still
# NA. Rows whose tavg is missing still cannot be predicted by either model.
# NOTE(review): impute_lm() is assumed to come from simputation, which leaves
# a value missing when one of its predictors is missing — confirm.
Weather_Bangalore_imp_lm_temp <- Weather_Bangalore %>%
  bind_shadow() %>%
  impute_lm(tmin ~ tavg) %>%
  impute_lm(prcp ~ tavg + tmin) %>%
  add_label_shadow()

ggplot(Weather_Bangalore_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()

##Analysing and Performing Imputations on Chennai_1990_2022_Madras.csv

## Have a look at the data

# First look at the Chennai daily weather table: shape, column preview,
# first/last rows, column names, structure and per-column summary.
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Chennai)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Chennai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 25.2, 24.9, 25.6, 25.7, 25.5, 24.7, 25.4, 25.6, 24.8, 24.7, 24.5,…
## $ tmin <dbl> 22.8, 21.7, 21.4, NA, 20.7, NA, 23.3, 22.0, 21.7, 20.7, 20.0, 18.…
## $ tmax <dbl> 28.4, 29.1, 29.8, 28.7, 28.4, 26.1, 27.0, 28.0, 28.5, 29.0, 28.8,…
## $ prcp <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 18.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Chennai)
##         time tavg tmin tmax prcp
## 1 01-01-1990 25.2 22.8 28.4  0.5
## 2 02-01-1990 24.9 21.7 29.1  0.0
## 3 03-01-1990 25.6 21.4 29.8  0.0
## 4 04-01-1990 25.7   NA 28.7  0.0
## 5 05-01-1990 25.5 20.7 28.4  0.0
## 6 06-01-1990 24.7   NA 26.1  0.5
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Chennai)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 28.9 26.2 33.0  9.3
## 11890 21-07-2022 28.4 24.5 32.8 21.1
## 11891 22-07-2022 27.8 24.6 32.2 22.1
## 11892 23-07-2022 27.4 24.7 32.6 18.6
## 11893 24-07-2022 27.8 25.0 33.3  9.1
## 11894 25-07-2022 28.1 25.4 32.6  2.9
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# Keep the names in `all_columns` and print them too: a bare assignment shows
# nothing, so the message above was previously followed by no output.
all_columns <- names(Weather_Chennai)
print(all_columns)
## [1] "time" "tavg" "tmin" "tmax" "prcp"

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Chennai)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  25.2 24.9 25.6 25.7 25.5 24.7 25.4 25.6 24.8 24.7 ...
##  $ tmin: num  22.8 21.7 21.4 NA 20.7 NA 23.3 22 21.7 20.7 ...
##  $ tmax: num  28.4 29.1 29.8 28.7 28.4 26.1 27 28 28.5 29 ...
##  $ prcp: num  0.5 0 0 0 0 0.5 18 0.5 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Chennai)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :20.90   Min.   :12.00   Min.   :23.80  
##  Class :character   1st Qu.:26.30   1st Qu.:22.60   1st Qu.:31.10  
##  Mode  :character   Median :28.70   Median :24.60   Median :34.00  
##                     Mean   :28.49   Mean   :24.38   Mean   :33.91  
##                     3rd Qu.:30.40   3rd Qu.:26.40   3rd Qu.:36.20  
##                     Max.   :36.60   Max.   :31.00   Max.   :44.60  
##                     NA's   :27      NA's   :3084    NA's   :1019   
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  6.244  
##  3rd Qu.:  3.000  
##  Max.   :344.900  
##  NA's   :4886
# Count the missing cells with base R first.
Weather_Chennai %>%
  is.na() %>%
  sum()
## [1] 9016
## 9016 cells are NA

# Same total via naniar, then the per-variable breakdown and the missingness
# of prcp in consecutive spans of 250 rows.
Weather_Chennai %>% n_miss() ## Total number of missing parameters
## [1] 9016
Weather_Chennai %>% miss_var_summary() ## Missingness summary
## # A tibble: 5 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 prcp       4886   41.1  
## 2 tmin       3084   25.9  
## 3 tmax       1019    8.57 
## 4 tavg         27    0.227
## 5 time          0    0
Weather_Chennai %>%
  miss_var_span(var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     27        223     0.108         0.892       250
##  2            2     22        228     0.088         0.912       250
##  3            3     30        220     0.12          0.88        250
##  4            4     27        223     0.108         0.892       250
##  5            5     17        233     0.068         0.932       250
##  6            6     31        219     0.124         0.876       250
##  7            7     38        212     0.152         0.848       250
##  8            8     17        233     0.068         0.932       250
##  9            9     24        226     0.096         0.904       250
## 10           10     23        227     0.092         0.908       250
## # ℹ 38 more rows
# Tabulate how many variables share each missing-value count.
Weather_Chennai %>% miss_var_table()
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1       20
## 2            27      1       20
## 3          1019      1       20
## 4          3084      1       20
## 5          4886      1       20
# Visual overview of where values are missing.
vis_miss(Weather_Chennai) ## visualise % of missing

gg_miss_upset(Weather_Chennai) ## plot for missing data

# Heat map of missingness per variable, grouped by calendar year.
# The grouping column should be categorical. Grouping by the continuous `prcp`
# split the data by every distinct rainfall amount, and ggplot removed rows
# with missing values (the earlier `geom_tile()` warning) — presumably the
# group where prcp itself is NA, which is the group of interest.
Weather_Chennai %>%
  mutate(obs_year = year(dmy(time))) %>%
  gg_miss_fct(fct = obs_year) ## Heat map of missingness

gg_miss_span(Weather_Chennai, var = prcp, span_every = 250) ## Visualize span of prcp missingness

## With this we can clearly see the precipitation data is missing a lot (about 41%), followed by tmin
miss_scan_count(data = Weather_Chennai, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
##   Variable     n
##   <chr>    <int>
## 1 time         0
## 2 tavg         0
## 3 tmin         0
## 4 tmax         0
## 5 prcp         0
# Shadow matrix: one `<col>_NA` factor per column marking NA / !NA.
Weather_Chennai %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 5
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     !NA     !NA     !NA    
## 2 !NA     !NA     !NA     !NA     !NA    
## 3 !NA     !NA     !NA     !NA     !NA    
## 4 !NA     !NA     NA      !NA     !NA    
## 5 !NA     !NA     !NA     !NA     !NA    
## 6 !NA     !NA     NA      !NA     !NA
# Nabular data: the original columns with the shadow columns bound alongside
# (only for variables that actually have missing values).
Weather_Chennai %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 9
##   time        tavg  tmin  tmax  prcp tavg_NA tmin_NA tmax_NA prcp_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct>   <fct>   <fct>   <fct>  
## 1 01-01-1990  25.2  22.8  28.4   0.5 !NA     !NA     !NA     !NA    
## 2 02-01-1990  24.9  21.7  29.1   0   !NA     !NA     !NA     !NA    
## 3 03-01-1990  25.6  21.4  29.8   0   !NA     !NA     !NA     !NA    
## 4 04-01-1990  25.7  NA    28.7   0   !NA     NA      !NA     !NA    
## 5 05-01-1990  25.5  20.7  28.4   0   !NA     !NA     !NA     !NA    
## 6 06-01-1990  24.7  NA    26.1   0.5 !NA     NA      !NA     !NA
# Compare the tavg distribution between days with and without a prcp reading.
# tavg itself contains NAs, so they are dropped inside each group: without
# na.rm = TRUE a single missing tavg turns that group's mean and sd into NA,
# which is what produced the NA row in the recorded output below.
Weather_Chennai %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added; the `NA` row now holds
## numeric values (re-run to refresh), the `!NA` row is unchanged:
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          28.2    2.48
## 2 NA           NA     NA

# Density of tavg, coloured by whether prcp is missing and split into panels
# by whether tmin is missing.
ggplot(bind_shadow(Weather_Chennai), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Removed 27 rows containing non-finite values (`stat_density()`).

# Precipitation against average temperature, using geom_miss_point() to
# display the missingness; first for all rows, then one panel per year.
Weather_Chennai %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

Weather_Chennai %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(dmy(time)))

# From the plots above, the missing data does not look excessive.

# Replace every NA with a value placed 10% below the observed range.
Weather_Chennai_imp <- Weather_Chennai %>% impute_below_all()
Weather_Chennai_imp %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Bind the shadow columns first so the imputed values remain identifiable.
Weather_Chennai_imp_track <- Weather_Chennai %>%
  bind_shadow() %>%
  impute_below_all()
Weather_Chennai_imp_track %>%
  ggplot(aes(x = prcp, fill = prcp_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Chennai_imp_track %>%
  ggplot(aes(x = tmin, fill = tmin_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Chennai_imp_track %>%
  ggplot(aes(x = tavg, y = prcp, color = prcp_NA)) +
  geom_point()

## All the NA values have been imputed successfully at this point

# Impute the two most incomplete variables, tmin and prcp, by linear
# regression on the other temperature columns.
# tmin is imputed first because the prcp model uses tmin as a predictor; in
# the previous order prcp could not be predicted on rows where tmin was still
# NA. Rows whose tavg is missing still cannot be predicted by either model.
# NOTE(review): impute_lm() is assumed to come from simputation, which leaves
# a value missing when one of its predictors is missing — confirm.
Weather_Chennai_imp_lm_temp <- Weather_Chennai %>%
  bind_shadow() %>%
  impute_lm(tmin ~ tavg) %>%
  impute_lm(prcp ~ tavg + tmin) %>%
  add_label_shadow()

ggplot(Weather_Chennai_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()

##Analysing and Performing Imputations on Delhi_NCR_1990_2022_Safdarjung.csv

## Have a look at the data

# First look at the Delhi daily weather table: shape, column preview,
# first/last rows, column names, structure and per-column summary.
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Delhi)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Delhi)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 9.4, 9.3, 9.0, 10.7, 12.6, 14.9, 14.4, 10.7, 13.4, 16.6, 17.0, 17…
## $ tmin <dbl> 6.0, 5.2, 6.5, 6.0, 7.3, 8.1, 8.1, 8.5, 7.0, NA, 10.9, 9.8, 8.8, …
## $ tmax <dbl> 15.1, 14.2, 13.6, 17.5, 20.8, 22.9, 21.4, 16.6, 20.6, 22.8, 25.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Delhi)
##         time tavg tmin tmax prcp
## 1 01-01-1990  9.4  6.0 15.1    0
## 2 02-01-1990  9.3  5.2 14.2    0
## 3 03-01-1990  9.0  6.5 13.6    0
## 4 04-01-1990 10.7  6.0 17.5    0
## 5 05-01-1990 12.6  7.3 20.8    0
## 6 06-01-1990 14.9  8.1 22.9    0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Delhi)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 30.1 26.5 33.2 14.7
## 11890 21-07-2022 28.6 26.8 30.6 21.2
## 11891 22-07-2022 29.3 27.0 32.9  0.3
## 11892 23-07-2022 30.1 25.5 34.9  8.9
## 11893 24-07-2022 30.6 27.1 35.7  0.0
## 11894 25-07-2022 30.7 26.8 35.7  0.0
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# Keep the names in `all_columns` and print them too: a bare assignment shows
# nothing, so the message above was previously followed by no output.
all_columns <- names(Weather_Delhi)
print(all_columns)
## [1] "time" "tavg" "tmin" "tmax" "prcp"

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Delhi)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  9.4 9.3 9 10.7 12.6 14.9 14.4 10.7 13.4 16.6 ...
##  $ tmin: num  6 5.2 6.5 6 7.3 8.1 8.1 8.5 7 NA ...
##  $ tmax: num  15.1 14.2 13.6 17.5 20.8 22.9 21.4 16.6 20.6 22.8 ...
##  $ prcp: num  0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Delhi)
##      time                tavg           tmin            tmax      
##  Length:11894       Min.   : 6.6   Min.   : 0.10   Min.   : 9.80  
##  Class :character   1st Qu.:18.5   1st Qu.:11.80   1st Qu.:26.70  
##  Mode  :character   Median :27.0   Median :20.00   Median :33.20  
##                     Mean   :25.0   Mean   :18.88   Mean   :31.79  
##                     3rd Qu.:30.9   3rd Qu.:26.00   3rd Qu.:36.60  
##                     Max.   :39.8   Max.   :34.20   Max.   :48.10  
##                     NA's   :94     NA's   :1536    NA's   :533    
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  3.662  
##  3rd Qu.:  0.500  
##  Max.   :262.900  
##  NA's   :6140
# Count the missing cells with base R first.
Weather_Delhi %>%
  is.na() %>%
  sum()
## [1] 8303
## 8303 cells are NA
# Same total via naniar, then the per-variable breakdown and the missingness
# of prcp in consecutive spans of 250 rows.
Weather_Delhi %>% n_miss() ## Total number of missing parameters
## [1] 8303
Weather_Delhi %>% miss_var_summary() ## Missingness summary
## # A tibble: 5 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 prcp       6140   51.6  
## 2 tmin       1536   12.9  
## 3 tmax        533    4.48 
## 4 tavg         94    0.790
## 5 time          0    0
Weather_Delhi %>%
  miss_var_span(var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     37        213     0.148         0.852       250
##  2            2     61        189     0.244         0.756       250
##  3            3     24        226     0.096         0.904       250
##  4            4    101        149     0.404         0.596       250
##  5            5     18        232     0.072         0.928       250
##  6            6     21        229     0.084         0.916       250
##  7            7     39        211     0.156         0.844       250
##  8            8     13        237     0.052         0.948       250
##  9            9     20        230     0.08          0.92        250
## 10           10     27        223     0.108         0.892       250
## # ℹ 38 more rows
# Tabulate how many variables share each missing-value count.
Weather_Delhi %>% miss_var_table()
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1       20
## 2            94      1       20
## 3           533      1       20
## 4          1536      1       20
## 5          6140      1       20
# Visual overview of where values are missing.
vis_miss(Weather_Delhi) ## visualise % of missing

gg_miss_upset(Weather_Delhi) ## plot for missing data

# Heat map of missingness per variable, grouped by calendar year.
# The grouping column should be categorical. Grouping by the continuous `prcp`
# split the data by every distinct rainfall amount, and ggplot removed rows
# with missing values (the earlier `geom_tile()` warning) — presumably the
# group where prcp itself is NA, which is the group of interest.
Weather_Delhi %>%
  mutate(obs_year = year(dmy(time))) %>%
  gg_miss_fct(fct = obs_year) ## Heat map of missingness

gg_miss_span(Weather_Delhi, var = prcp, span_every = 250) ## Visualize span of prcp missingness

## With this we can clearly see the precipitation data is missing a lot (about 52%), followed by tmin
miss_scan_count(data = Weather_Delhi, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
##   Variable     n
##   <chr>    <int>
## 1 time         0
## 2 tavg         0
## 3 tmin         0
## 4 tmax         0
## 5 prcp         0
# Shadow matrix: one `<col>_NA` factor per column marking NA / !NA.
Weather_Delhi %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 5
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     !NA     !NA     !NA    
## 2 !NA     !NA     !NA     !NA     !NA    
## 3 !NA     !NA     !NA     !NA     !NA    
## 4 !NA     !NA     !NA     !NA     !NA    
## 5 !NA     !NA     !NA     !NA     !NA    
## 6 !NA     !NA     !NA     !NA     !NA
# Nabular data: the original columns with the shadow columns bound alongside
# (only for variables that actually have missing values).
Weather_Delhi %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 9
##   time        tavg  tmin  tmax  prcp tavg_NA tmin_NA tmax_NA prcp_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct>   <fct>   <fct>   <fct>  
## 1 01-01-1990   9.4   6    15.1     0 !NA     !NA     !NA     !NA    
## 2 02-01-1990   9.3   5.2  14.2     0 !NA     !NA     !NA     !NA    
## 3 03-01-1990   9     6.5  13.6     0 !NA     !NA     !NA     !NA    
## 4 04-01-1990  10.7   6    17.5     0 !NA     !NA     !NA     !NA    
## 5 05-01-1990  12.6   7.3  20.8     0 !NA     !NA     !NA     !NA    
## 6 06-01-1990  14.9   8.1  22.9     0 !NA     !NA     !NA     !NA
# Compare the tavg distribution between days with and without a prcp reading.
# tavg itself contains NAs, so they are dropped inside each group: without
# na.rm = TRUE a single missing tavg turns that group's mean and sd into NA,
# which is what produced the NA row in the recorded output below.
Weather_Delhi %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added; the `NA` row now holds
## numeric values (re-run to refresh), the `!NA` row is unchanged:
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          25.1    7.07
## 2 NA           NA     NA

# Density of tavg, coloured by whether prcp is missing and split into panels
# by whether tmin is missing.
ggplot(bind_shadow(Weather_Delhi), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Removed 94 rows containing non-finite values (`stat_density()`).

# Precipitation against average temperature, using geom_miss_point() to
# display the missingness; first for all rows, then one panel per year.
Weather_Delhi %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

Weather_Delhi %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(dmy(time)))

# From the plots above, the missing data does not look excessive.

# Replace every NA with a value placed 10% below the observed range.
Weather_Delhi_imp <- Weather_Delhi %>% impute_below_all()
Weather_Delhi_imp %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Bind the shadow columns first so the imputed values remain identifiable.
Weather_Delhi_imp_track <- Weather_Delhi %>%
  bind_shadow() %>%
  impute_below_all()
Weather_Delhi_imp_track %>%
  ggplot(aes(x = prcp, fill = prcp_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Delhi_imp_track %>%
  ggplot(aes(x = tmin, fill = tmin_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Delhi_imp_track %>%
  ggplot(aes(x = tavg, y = prcp, color = prcp_NA)) +
  geom_point()

## All the NA values have been imputed successfully at this point

# Impute the two most incomplete variables, tmin and prcp, by linear
# regression on the other temperature columns.
# tmin is imputed first because the prcp model uses tmin as a predictor; in
# the previous order prcp could not be predicted on rows where tmin was still
# NA. Rows whose tavg is missing still cannot be predicted by either model.
# NOTE(review): impute_lm() is assumed to come from simputation, which leaves
# a value missing when one of its predictors is missing — confirm.
Weather_Delhi_imp_lm_temp <- Weather_Delhi %>%
  bind_shadow() %>%
  impute_lm(tmin ~ tavg) %>%
  impute_lm(prcp ~ tavg + tmin) %>%
  add_label_shadow()

ggplot(Weather_Delhi_imp_lm_temp, aes(x = tavg, y = prcp, color = any_missing)) + geom_miss_point()

##Analysing and Performing Imputations on Lucknow_1990_2022.csv

## Have a look at the data

# First look at the Lucknow daily weather table: shape, column preview,
# first/last rows, column names, structure and per-column summary.
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Lucknow)
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Lucknow)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 7.2, 10.5, 10.2, 9.1, 13.5, 11.5, 14.2, 17.1, 11.1, 14.8, 12.9, 1…
## $ tmin <dbl> NA, NA, 1.8, NA, NA, 5.9, 5.4, NA, NA, 4.1, 5.1, 7.3, NA, 6.9, 9.…
## $ tmax <dbl> 18.1, 17.2, 18.6, 19.3, 23.8, 21.4, 23.6, 24.6, 24.6, 23.6, 23.6,…
## $ prcp <dbl> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, …
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Lucknow)
##         time tavg tmin tmax prcp
## 1 01-01-1990  7.2   NA 18.1    0
## 2 02-01-1990 10.5   NA 17.2    0
## 3 03-01-1990 10.2  1.8 18.6   NA
## 4 04-01-1990  9.1   NA 19.3    0
## 5 05-01-1990 13.5   NA 23.8    0
## 6 06-01-1990 11.5  5.9 21.4    0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Lucknow)
##             time tavg tmin tmax prcp
## 11889 20-07-2022 28.6 25.1 33.1 17.7
## 11890 21-07-2022 27.4 25.1 33.1 27.3
## 11891 22-07-2022 28.1 26.1 31.1 16.0
## 11892 23-07-2022 30.3 26.2 34.7 11.9
## 11893 24-07-2022 30.0 28.1 34.7  2.0
## 11894 25-07-2022 27.1 24.1 34.3  0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# Keep the names in `all_columns` and print them too: a bare assignment shows
# nothing, so the message above was previously followed by no output.
all_columns <- names(Weather_Lucknow)
print(all_columns)
## [1] "time" "tavg" "tmin" "tmax" "prcp"

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Lucknow)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  7.2 10.5 10.2 9.1 13.5 11.5 14.2 17.1 11.1 14.8 ...
##  $ tmin: num  NA NA 1.8 NA NA 5.9 5.4 NA NA 4.1 ...
##  $ tmax: num  18.1 17.2 18.6 19.3 23.8 21.4 23.6 24.6 24.6 23.6 ...
##  $ prcp: num  0 0 NA 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Lucknow)
##      time                tavg            tmin           tmax      
##  Length:11894       Min.   : 5.70   Min.   :-0.6   Min.   :11.10  
##  Class :character   1st Qu.:19.50   1st Qu.:12.5   1st Qu.:28.10  
##  Mode  :character   Median :27.20   Median :20.5   Median :33.40  
##                     Mean   :25.22   Mean   :18.8   Mean   :32.49  
##                     3rd Qu.:30.40   3rd Qu.:25.1   3rd Qu.:36.50  
##                     Max.   :39.70   Max.   :32.7   Max.   :47.30  
##                     NA's   :138     NA's   :3515   NA's   :1553   
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  4.536  
##  3rd Qu.:  1.000  
##  Max.   :470.900  
##  NA's   :6152
# Count the missing cells with base R first.
Weather_Lucknow %>%
  is.na() %>%
  sum()
## [1] 11358
## 11358 cells are NA

# Same total via naniar, then the per-variable breakdown and the missingness
# of prcp in consecutive spans of 250 rows.
Weather_Lucknow %>% n_miss() ## Total number of missing parameters
## [1] 11358
Weather_Lucknow %>% miss_var_summary() ## Missingness summary
## # A tibble: 5 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 prcp       6152    51.7 
## 2 tmin       3515    29.6 
## 3 tmax       1553    13.1 
## 4 tavg        138     1.16
## 5 time          0     0
Weather_Lucknow %>%
  miss_var_span(var = prcp, span_every = 250) ## Missingness spread
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     75        175     0.3           0.7         250
##  2            2     65        185     0.26          0.74        250
##  3            3     39        211     0.156         0.844       250
##  4            4     39        211     0.156         0.844       250
##  5            5     21        229     0.084         0.916       250
##  6            6     35        215     0.14          0.86        250
##  7            7     34        216     0.136         0.864       250
##  8            8     24        226     0.096         0.904       250
##  9            9     47        203     0.188         0.812       250
## 10           10     67        183     0.268         0.732       250
## # ℹ 38 more rows
# Tabulate how many variables share each missing-value count.
Weather_Lucknow %>% miss_var_table()
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1       20
## 2           138      1       20
## 3          1553      1       20
## 4          3515      1       20
## 5          6152      1       20
# Visual overview of where values are missing.
vis_miss(Weather_Lucknow) ## visualise % of missing

gg_miss_upset(Weather_Lucknow) ## plot for missing data

# Heat map of missingness per variable, grouped by calendar year.
# The grouping column should be categorical. Grouping by the continuous `prcp`
# split the data by every distinct rainfall amount, and ggplot removed rows
# with missing values (the earlier `geom_tile()` warning) — presumably the
# group where prcp itself is NA, which is the group of interest.
Weather_Lucknow %>%
  mutate(obs_year = year(dmy(time))) %>%
  gg_miss_fct(fct = obs_year) ## Heat map of missingness

gg_miss_span(Weather_Lucknow, var = prcp, span_every = 250) ## Visualize span of prcp missingness

## With this we can clearly see the precipitation data is missing a lot (about 52%), followed by tmin
miss_scan_count(data = Weather_Lucknow, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 5 × 2
##   Variable     n
##   <chr>    <int>
## 1 time         0
## 2 tavg         0
## 3 tmin         0
## 4 tmax         0
## 5 prcp         0
# Shadow matrix: one `<col>_NA` factor per column marking NA / !NA.
Weather_Lucknow %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 5
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     NA      !NA     !NA    
## 2 !NA     !NA     NA      !NA     !NA    
## 3 !NA     !NA     !NA     !NA     NA     
## 4 !NA     !NA     NA      !NA     !NA    
## 5 !NA     !NA     NA      !NA     !NA    
## 6 !NA     !NA     !NA     !NA     !NA
# Nabular data: the original columns with the shadow columns bound alongside
# (only for variables that actually have missing values).
Weather_Lucknow %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 9
##   time        tavg  tmin  tmax  prcp tavg_NA tmin_NA tmax_NA prcp_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct>   <fct>   <fct>   <fct>  
## 1 01-01-1990   7.2  NA    18.1     0 !NA     NA      !NA     !NA    
## 2 02-01-1990  10.5  NA    17.2     0 !NA     NA      !NA     !NA    
## 3 03-01-1990  10.2   1.8  18.6    NA !NA     !NA     !NA     NA     
## 4 04-01-1990   9.1  NA    19.3     0 !NA     NA      !NA     !NA    
## 5 05-01-1990  13.5  NA    23.8     0 !NA     NA      !NA     !NA    
## 6 06-01-1990  11.5   5.9  21.4     0 !NA     !NA     !NA     !NA
# Explore the relationship with the missing values: compare tavg between rows
# where prcp was recorded (prcp_NA == "!NA") and rows where it is missing.
# na.rm = TRUE is required because tavg itself contains NAs; without it one
# missing tavg turns the mean and sd of the whole group into NA.
Weather_Lucknow %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added (re-run to refresh):
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          25.7    6.33
## 2 NA           NA     NA
# Without na.rm = TRUE the SD and mean of the "NA" group became NA as well

# Density of tavg, coloured by prcp missingness, one panel per tmin missingness
ggplot(bind_shadow(Weather_Lucknow), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Removed 138 rows containing non-finite values (`stat_density()`).

# Explore the missingness in precipitation and air temperature with
# `geom_miss_point`, first overall and then one panel per year
Weather_Lucknow %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

Weather_Lucknow %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(dmy(time)))

# Looks like there is not too much missing data

# Impute every missing value with a value 10% below the range of its column
Weather_Lucknow_imp <- Weather_Lucknow %>% impute_below_all()
Weather_Lucknow_imp %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Bind the shadow columns first so the imputed values can still be tracked
Weather_Lucknow_imp_track <- impute_below_all(bind_shadow(Weather_Lucknow))
Weather_Lucknow_imp_track %>%
  ggplot(aes(x = prcp, fill = prcp_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Lucknow_imp_track %>%
  ggplot(aes(x = tmin, fill = tmin_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Lucknow_imp_track %>%
  ggplot(aes(x = tavg, y = prcp, color = prcp_NA)) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now fix the critically missing parameters prcp and tmin via linear
# regression on the other explanatory parameters.
# tmin is imputed first: the prcp model uses tmin as a predictor, so running
# prcp ~ tavg + tmin before tmin is filled leaves every row with a missing
# tmin unimputed.
# NOTE(review): rows where tavg itself is missing presumably stay NA in both
# columns - confirm against the impute_lm() documentation.
# bind_shadow() keeps track of the originally missing cells and
# add_label_shadow() adds the any_missing label used for colouring below.
Weather_Lucknow_imp_lm_temp <- Weather_Lucknow %>%
  bind_shadow() %>%
  impute_lm(tmin ~ tavg) %>%
  impute_lm(prcp ~ tavg + tmin) %>%
  add_label_shadow()

ggplot(
  Weather_Lucknow_imp_lm_temp,
  aes(x = tavg, y = prcp, color = any_missing)
) +
  geom_miss_point()

##Analysing and Performing Imputations on Mumbai_1990_2022_Santacruz.csv

## Have a look at the data

## First look at the table: size, column types, first and last rows
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
Weather_Mumbai %>% dim()
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
Weather_Mumbai %>% glimpse()
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 23.2, 22.2, 21.8, 25.4, 26.5, 25.1, 26.0, 26.6, 25.1, 26.8, 25.6,…
## $ tmin <dbl> 17.0, 16.5, 16.3, 17.9, 19.3, 19.8, 18.9, 18.8, 19.0, 19.3, 18.5,…
## $ tmax <dbl> NA, 29.9, 30.7, 31.8, 33.7, 33.5, 33.7, 34.6, 34.4, 34.7, 34.0, 3…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
Weather_Mumbai %>% head()
##         time tavg tmin tmax prcp
## 1 01-01-1990 23.2 17.0   NA    0
## 2 02-01-1990 22.2 16.5 29.9    0
## 3 03-01-1990 21.8 16.3 30.7    0
## 4 04-01-1990 25.4 17.9 31.8    0
## 5 05-01-1990 26.5 19.3 33.7    0
## 6 06-01-1990 25.1 19.8 33.5    0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
Weather_Mumbai %>% tail()
##             time tavg tmin tmax prcp
## 11889 20-07-2022 27.4 25.0 30.5 11.9
## 11890 21-07-2022 27.6 25.6 30.5 10.9
## 11891 22-07-2022 28.3 26.0 30.5  3.0
## 11892 23-07-2022 28.2 25.8 31.3  5.1
## 11893 24-07-2022 28.1 25.6 30.4  7.1
## 11894 25-07-2022 28.3 25.1 30.2  7.1
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# The names were announced but only assigned, never shown; print them so the
# report actually lists the columns. all_columns is kept for later use.
all_columns <- names(Weather_Mumbai)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Mumbai)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  23.2 22.2 21.8 25.4 26.5 25.1 26 26.6 25.1 26.8 ...
##  $ tmin: num  17 16.5 16.3 17.9 19.3 19.8 18.9 18.8 19 19.3 ...
##  $ tmax: num  NA 29.9 30.7 31.8 33.7 33.5 33.7 34.6 34.4 34.7 ...
##  $ prcp: num  0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
# Per-column five-number summary plus the NA count of each numeric column
summary(Weather_Mumbai)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :17.70   Min.   : 8.50   Min.   :22.30  
##  Class :character   1st Qu.:26.60   1st Qu.:19.80   1st Qu.:30.90  
##  Mode  :character   Median :28.10   Median :23.70   Median :32.40  
##                     Mean   :27.76   Mean   :22.62   Mean   :32.31  
##                     3rd Qu.:29.30   3rd Qu.:25.40   3rd Qu.:33.90  
##                     Max.   :33.70   Max.   :30.40   Max.   :41.30  
##                     NA's   :11      NA's   :2454    NA's   :1907   
##       prcp       
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 10.94  
##  3rd Qu.:  7.10  
##  Max.   :461.00  
##  NA's   :4681
## Total number of NA cells, counted with base R
Weather_Mumbai %>%
  is.na() %>%
  sum()
## [1] 9053
## 9053 entries are NA

## Lets analyse the missing data of the dataset
## Total number of missing values, this time counted by naniar
Weather_Mumbai %>% n_miss()
## [1] 9053
## Missingness summary per variable
Weather_Mumbai %>% miss_var_summary()
## # A tibble: 5 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 prcp       4681  39.4   
## 2 tmin       2454  20.6   
## 3 tmax       1907  16.0   
## 4 tavg         11   0.0925
## 5 time          0   0
## Missingness spread of prcp over consecutive spans of 250 rows
Weather_Mumbai %>% miss_var_span(var = prcp, span_every = 250)
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1      7        243     0.028         0.972       250
##  2            2     13        237     0.052         0.948       250
##  3            3     21        229     0.084         0.916       250
##  4            4      5        245     0.02          0.98        250
##  5            5      5        245     0.02          0.98        250
##  6            6     16        234     0.064         0.936       250
##  7            7      8        242     0.032         0.968       250
##  8            8      9        241     0.036         0.964       250
##  9            9      7        243     0.028         0.972       250
## 10           10      5        245     0.02          0.98        250
## # ℹ 38 more rows
## Tabulate how many variables share each missing-value count
Weather_Mumbai %>% miss_var_table()
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1       20
## 2            11      1       20
## 3          1907      1       20
## 4          2454      1       20
## 5          4681      1       20
## Visualise the percentage of missing values
Weather_Mumbai %>% vis_miss()

## Upset plot of the missing-data patterns
Weather_Mumbai %>% gg_miss_upset()

## Heat map of missingness, split by the values of prcp
Weather_Mumbai %>% gg_miss_fct(fct = prcp)
## Warning: Removed 4 rows containing missing values (`geom_tile()`).

## Missingness of prcp in consecutive spans of 250 rows
Weather_Mumbai %>% gg_miss_span(var = prcp, span_every = 250)

## With this we can clearly see the precipitation data is missing a lot (39%), followed by tmin
## Scan for literal placeholder strings; none are found (no empty cells)
Weather_Mumbai %>%
  miss_scan_count(search = list("N/A", "NA", "na", " ", "missing"))
## # A tibble: 5 × 2
##   Variable     n
##   <chr>    <int>
## 1 time         0
## 2 tavg         0
## 3 tmin         0
## 4 tmax         0
## 5 prcp         0
## Shadow matrix: one "NA" / "!NA" factor column per original variable
Weather_Mumbai %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 5
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     !NA     NA      !NA    
## 2 !NA     !NA     !NA     !NA     !NA    
## 3 !NA     !NA     !NA     !NA     !NA    
## 4 !NA     !NA     !NA     !NA     !NA    
## 5 !NA     !NA     !NA     !NA     !NA    
## 6 !NA     !NA     !NA     !NA     !NA
## Nabular data: the original columns with the shadow columns bound on,
## restricted to variables that contain missing values
Weather_Mumbai %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 9
##   time        tavg  tmin  tmax  prcp tavg_NA tmin_NA tmax_NA prcp_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct>   <fct>   <fct>   <fct>  
## 1 01-01-1990  23.2  17    NA       0 !NA     !NA     NA      !NA    
## 2 02-01-1990  22.2  16.5  29.9     0 !NA     !NA     !NA     !NA    
## 3 03-01-1990  21.8  16.3  30.7     0 !NA     !NA     !NA     !NA    
## 4 04-01-1990  25.4  17.9  31.8     0 !NA     !NA     !NA     !NA    
## 5 05-01-1990  26.5  19.3  33.7     0 !NA     !NA     !NA     !NA    
## 6 06-01-1990  25.1  19.8  33.5     0 !NA     !NA     !NA     !NA
# Explore the relationship with the missing values: compare tavg between rows
# where prcp was recorded (prcp_NA == "!NA") and rows where it is missing.
# na.rm = TRUE is required because tavg itself contains NAs; without it one
# missing tavg turns the mean and sd of the whole group into NA.
Weather_Mumbai %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added (re-run to refresh):
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          27.6    2.10
## 2 NA           NA     NA
# Without na.rm = TRUE the SD and mean of the "NA" group became NA as well

# Density of tavg, coloured by prcp missingness, one panel per tmin missingness
ggplot(bind_shadow(Weather_Mumbai), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Removed 11 rows containing non-finite values (`stat_density()`).

# Explore the missingness in precipitation and air temperature with
# `geom_miss_point`, first overall and then one panel per year
Weather_Mumbai %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

Weather_Mumbai %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(dmy(time)))

# Looks like there is not too much missing data

# Impute every missing value with a value 10% below the range of its column
Weather_Mumbai_imp <- Weather_Mumbai %>% impute_below_all()
Weather_Mumbai_imp %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Bind the shadow columns first so the imputed values can still be tracked
Weather_Mumbai_imp_track <- impute_below_all(bind_shadow(Weather_Mumbai))
Weather_Mumbai_imp_track %>%
  ggplot(aes(x = prcp, fill = prcp_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Mumbai_imp_track %>%
  ggplot(aes(x = tmin, fill = tmin_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Mumbai_imp_track %>%
  ggplot(aes(x = tavg, y = prcp, color = prcp_NA)) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now fix the critically missing parameters prcp and tmin via linear
# regression on the other explanatory parameters.
# tmin is imputed first: the prcp model uses tmin as a predictor, so running
# prcp ~ tavg + tmin before tmin is filled leaves every row with a missing
# tmin unimputed.
# NOTE(review): rows where tavg itself is missing presumably stay NA in both
# columns - confirm against the impute_lm() documentation.
# bind_shadow() keeps track of the originally missing cells and
# add_label_shadow() adds the any_missing label used for colouring below.
Weather_Mumbai_imp_lm_temp <- Weather_Mumbai %>%
  bind_shadow() %>%
  impute_lm(tmin ~ tavg) %>%
  impute_lm(prcp ~ tavg + tmin) %>%
  add_label_shadow()

ggplot(
  Weather_Mumbai_imp_lm_temp,
  aes(x = tavg, y = prcp, color = any_missing)
) +
  geom_miss_point()

##Analysing and Performing Imputations on Rajasthan_1990_2022_Jodhpur.csv

## Have a look at the data


## First look at the table: size, column types, first and last rows
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
Weather_Jodhpur %>% dim()
## [1] 11894     5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
Weather_Jodhpur %>% glimpse()
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
Weather_Jodhpur %>% head()
##         time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4   NA
## 2 02-01-1990 21.7   NA 26.5    0
## 3 03-01-1990 21.0 16.4 26.5    0
## 4 04-01-1990 20.8   NA 27.4    0
## 5 05-01-1990 20.4 14.2 26.1    0
## 6 06-01-1990 20.4 17.1 24.2   NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
Weather_Jodhpur %>% tail()
##             time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8  0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9  0.0
## 11892 23-07-2022 23.1 20.9 26.7  0.0
## 11893 24-07-2022 22.8 20.0 26.7  0.3
## 11894 25-07-2022 24.1 20.2 28.5  0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
# The names were announced but only assigned, never shown; print them so the
# report actually lists the columns. all_columns is kept for later use.
all_columns <- names(Weather_Jodhpur)
print(all_columns)

print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Jodhpur)
## 'data.frame':    11894 obs. of  5 variables:
##  $ time: chr  "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
##  $ tavg: num  22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
##  $ tmin: num  19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
##  $ tmax: num  28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
##  $ prcp: num  NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
# Per-column five-number summary plus the NA count of each numeric column
summary(Weather_Jodhpur)
##      time                tavg            tmin            tmax      
##  Length:11894       Min.   :17.20   Min.   : 9.30   Min.   :19.80  
##  Class :character   1st Qu.:22.30   1st Qu.:18.10   1st Qu.:27.90  
##  Mode  :character   Median :23.50   Median :19.80   Median :29.50  
##                     Mean   :23.84   Mean   :19.39   Mean   :29.93  
##                     3rd Qu.:25.20   3rd Qu.:20.80   3rd Qu.:32.00  
##                     Max.   :32.40   Max.   :27.90   Max.   :39.20  
##                     NA's   :70      NA's   :1389    NA's   :629    
##       prcp        
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  4.414  
##  3rd Qu.:  2.000  
##  Max.   :271.300  
##  NA's   :4620
## Total number of NA cells, counted with base R
Weather_Jodhpur %>%
  is.na() %>%
  sum()
## [1] 6708
## 6708 entries are NA

## Lets analyse the missing data of the dataset
## Total number of missing values, this time counted by naniar
Weather_Jodhpur %>% n_miss()
## [1] 6708
## Missingness summary per variable
Weather_Jodhpur %>% miss_var_summary()
## # A tibble: 5 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 prcp       4620   38.8  
## 2 tmin       1389   11.7  
## 3 tmax        629    5.29 
## 4 tavg         70    0.589
## 5 time          0    0
## Missingness spread of prcp over consecutive spans of 250 rows
Weather_Jodhpur %>% miss_var_span(var = prcp, span_every = 250)
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     42        208     0.168         0.832       250
##  2            2     47        203     0.188         0.812       250
##  3            3     50        200     0.2           0.8         250
##  4            4     41        209     0.164         0.836       250
##  5            5     34        216     0.136         0.864       250
##  6            6     31        219     0.124         0.876       250
##  7            7     39        211     0.156         0.844       250
##  8            8     18        232     0.072         0.928       250
##  9            9     46        204     0.184         0.816       250
## 10           10     38        212     0.152         0.848       250
## # ℹ 38 more rows
## Tabulate how many variables share each missing-value count
Weather_Jodhpur %>% miss_var_table()
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1       20
## 2            70      1       20
## 3           629      1       20
## 4          1389      1       20
## 5          4620      1       20
## Visualise the percentage of missing values
Weather_Jodhpur %>% vis_miss()

## Upset plot of the missing-data patterns
Weather_Jodhpur %>% gg_miss_upset()

## Heat map of missingness, split by the values of prcp
Weather_Jodhpur %>% gg_miss_fct(fct = prcp)
## Warning: Removed 4 rows containing missing values (`geom_tile()`).

## Missingness of prcp in consecutive spans of 250 rows
Weather_Jodhpur %>% gg_miss_span(var = prcp, span_every = 250)

## With this we can clearly see the precipitation data is missing a lot (39%), followed by tmin
## Scan for literal placeholder strings; none are found (no empty cells)
Weather_Jodhpur %>%
  miss_scan_count(search = list("N/A", "NA", "na", " ", "missing"))
## # A tibble: 5 × 2
##   Variable     n
##   <chr>    <int>
## 1 time         0
## 2 tavg         0
## 3 tmin         0
## 4 tmax         0
## 5 prcp         0
## Shadow matrix: one "NA" / "!NA" factor column per original variable
Weather_Jodhpur %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 5
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     !NA     !NA     NA     
## 2 !NA     !NA     NA      !NA     !NA    
## 3 !NA     !NA     !NA     !NA     !NA    
## 4 !NA     !NA     NA      !NA     !NA    
## 5 !NA     !NA     !NA     !NA     !NA    
## 6 !NA     !NA     !NA     !NA     NA
## Nabular data: the original columns with the shadow columns bound on,
## restricted to variables that contain missing values
Weather_Jodhpur %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 9
##   time        tavg  tmin  tmax  prcp tavg_NA tmin_NA tmax_NA prcp_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct>   <fct>   <fct>   <fct>  
## 1 01-01-1990  22.9  19.1  28.4    NA !NA     !NA     !NA     NA     
## 2 02-01-1990  21.7  NA    26.5     0 !NA     NA      !NA     !NA    
## 3 03-01-1990  21    16.4  26.5     0 !NA     !NA     !NA     !NA    
## 4 04-01-1990  20.8  NA    27.4     0 !NA     NA      !NA     !NA    
## 5 05-01-1990  20.4  14.2  26.1     0 !NA     !NA     !NA     !NA    
## 6 06-01-1990  20.4  17.1  24.2    NA !NA     !NA     !NA     NA
# Explore the relationship with the missing values: compare tavg between rows
# where prcp was recorded (prcp_NA == "!NA") and rows where it is missing.
# na.rm = TRUE is required because tavg itself contains NAs; without it one
# missing tavg turns the mean and sd of the whole group into NA.
Weather_Jodhpur %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added (re-run to refresh):
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          23.7    2.17
## 2 NA           NA     NA
# Without na.rm = TRUE the SD and mean of the "NA" group became NA as well

# Density of tavg, coloured by prcp missingness, one panel per tmin missingness
ggplot(bind_shadow(Weather_Jodhpur), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Removed 70 rows containing non-finite values (`stat_density()`).

# Explore the missingness in precipitation and air temperature with
# `geom_miss_point`, first overall and then one panel per year
Weather_Jodhpur %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

Weather_Jodhpur %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(dmy(time)))

# Looks like there is not too much missing data

# Impute every missing value with a value 10% below the range of its column
Weather_Jodhpur_imp <- Weather_Jodhpur %>% impute_below_all()
Weather_Jodhpur_imp %>%
  ggplot(aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Bind the shadow columns first so the imputed values can still be tracked
Weather_Jodhpur_imp_track <- impute_below_all(bind_shadow(Weather_Jodhpur))
Weather_Jodhpur_imp_track %>%
  ggplot(aes(x = prcp, fill = prcp_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Jodhpur_imp_track %>%
  ggplot(aes(x = tmin, fill = tmin_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather_Jodhpur_imp_track %>%
  ggplot(aes(x = tavg, y = prcp, color = prcp_NA)) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now fix the critically missing parameters prcp and tmin via linear
# regression on the other explanatory parameters.
# tmin is imputed first: the prcp model uses tmin as a predictor, so running
# prcp ~ tavg + tmin before tmin is filled leaves every row with a missing
# tmin unimputed.
# NOTE(review): rows where tavg itself is missing presumably stay NA in both
# columns - confirm against the impute_lm() documentation.
# bind_shadow() keeps track of the originally missing cells and
# add_label_shadow() adds the any_missing label used for colouring below.
Weather_Jodhpur_imp_lm_temp <- Weather_Jodhpur %>%
  bind_shadow() %>%
  impute_lm(tmin ~ tavg) %>%
  impute_lm(prcp ~ tavg + tmin) %>%
  add_label_shadow()

ggplot(
  Weather_Jodhpur_imp_lm_temp,
  aes(x = tavg, y = prcp, color = any_missing)
) +
  geom_miss_point()

##Analysing weather_Bhubhneshwar_1990_2022.csv

## Have a look at the data

#definitely has more columns than the cities that we have seen so far

## Size of the table, then a compact view of every column
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
Weather_Bhubhneshwar %>% dim()
## [1] 11935    11
# 11 columns here, i.e. 6 more than the 5-column city files above

print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
Weather_Bhubhneshwar %>% glimpse()
## Rows: 11,935
## Columns: 11
## $ time <chr> "1990-01-01", "1990-01-02", "1990-01-03", "1990-01-04", "1990-01-…
## $ tavg <dbl> 20.1, 20.7, 20.7, 18.8, 19.8, 22.2, 20.8, 20.3, 22.3, 21.6, 21.7,…
## $ tmin <dbl> NA, 16.4, 16.0, NA, 11.0, 12.5, NA, 13.6, 14.8, 14.5, 15.6, 12.8,…
## $ tmax <dbl> 28.0, NA, 27.4, 28.0, 28.2, NA, NA, 29.5, 31.6, 30.8, 30.7, 29.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, 0, …
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wspd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## First rows, last rows and column names of the table
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
Weather_Bhubhneshwar %>% head()
##         time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 1990-01-01 20.1   NA 28.0    0   NA   NA   NA   NA   NA   NA
## 2 1990-01-02 20.7 16.4   NA    0   NA   NA   NA   NA   NA   NA
## 3 1990-01-03 20.7 16.0 27.4    0   NA   NA   NA   NA   NA   NA
## 4 1990-01-04 18.8   NA 28.0    0   NA   NA   NA   NA   NA   NA
## 5 1990-01-05 19.8 11.0 28.2    0   NA   NA   NA   NA   NA   NA
## 6 1990-01-06 22.2 12.5   NA    0   NA   NA   NA   NA   NA   NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
Weather_Bhubhneshwar %>% tail()
##             time tavg tmin tmax prcp snow wdir wspd wpgt   pres tsun
## 11930 2022-08-30 30.0 27.0 34.0  1.2   NA  169  8.3   NA 1007.6   NA
## 11931 2022-08-31 29.2 26.3 33.0  9.0   NA  186  8.2   NA 1006.6   NA
## 11932 2022-09-01 29.6 27.0 33.0  2.1   NA  190  9.5   NA 1006.8   NA
## 11933 2022-09-02 29.7 26.3 33.0  3.3   NA  198  9.5   NA 1007.3   NA
## 11934 2022-09-03 29.2 26.1 34.0  9.7   NA  215  8.5   NA 1005.5   NA
## 11935 2022-09-04 27.6 25.9 31.6 12.8   NA  214  8.6   NA 1004.9   NA
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
Weather_Bhubhneshwar %>% names()
##  [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow (snow depth), wdir (wind direction), wspd (wind speed), wpgt (peak wind gust), pres (air pressure) and tsun (sunshine duration)

## Column types, then per-column summary statistics with NA counts
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
Weather_Bhubhneshwar %>% str()
## 'data.frame':    11935 obs. of  11 variables:
##  $ time: chr  "1990-01-01" "1990-01-02" "1990-01-03" "1990-01-04" ...
##  $ tavg: num  20.1 20.7 20.7 18.8 19.8 22.2 20.8 20.3 22.3 21.6 ...
##  $ tmin: num  NA 16.4 16 NA 11 12.5 NA 13.6 14.8 14.5 ...
##  $ tmax: num  28 NA 27.4 28 28.2 NA NA 29.5 31.6 30.8 ...
##  $ prcp: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow: logi  NA NA NA NA NA NA ...
##  $ wdir: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ wspd: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ wpgt: logi  NA NA NA NA NA NA ...
##  $ pres: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tsun: logi  NA NA NA NA NA NA ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
Weather_Bhubhneshwar %>% summary()
##      time                tavg            tmin            tmax     
##  Length:11935       Min.   :15.70   Min.   : 8.20   Min.   :19.4  
##  Class :character   1st Qu.:24.70   1st Qu.:19.00   1st Qu.:30.4  
##  Mode  :character   Median :27.70   Median :24.00   Median :32.8  
##                     Mean   :26.99   Mean   :22.24   Mean   :33.0  
##                     3rd Qu.:29.40   3rd Qu.:25.60   3rd Qu.:35.4  
##                     Max.   :37.40   Max.   :31.80   Max.   :46.7  
##                     NA's   :78      NA's   :2090    NA's   :891   
##       prcp           snow              wdir            wspd       
##  Min.   :  0.000   Mode:logical   Min.   :  0.0   Min.   : 0.500  
##  1st Qu.:  0.000   NA's:11935     1st Qu.: 89.0   1st Qu.: 4.500  
##  Median :  0.000                  Median :188.0   Median : 7.000  
##  Mean   :  7.074                  Mean   :169.1   Mean   : 8.399  
##  3rd Qu.:  4.100                  3rd Qu.:220.8   3rd Qu.:11.000  
##  Max.   :470.900                  Max.   :359.0   Max.   :33.100  
##  NA's   :5097                     NA's   :10641   NA's   :9806    
##    wpgt              pres          tsun        
##  Mode:logical   Min.   : 990.6   Mode:logical  
##  NA's:11935     1st Qu.:1002.9   NA's:11935    
##                 Median :1007.3                 
##                 Mean   :1007.4                 
##                 3rd Qu.:1012.4                 
##                 Max.   :1019.3                 
##                 NA's   :10692
## Total number of NA cells, counted with base R
Weather_Bhubhneshwar %>%
  is.na() %>%
  sum()
## [1] 75100
## 75100 entries are NA
## Lets analyse the missing data of the dataset
## Total number of missing values, this time counted by naniar
Weather_Bhubhneshwar %>% n_miss()
## [1] 75100
## Missingness summary per variable
Weather_Bhubhneshwar %>% miss_var_summary()
## # A tibble: 11 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 snow      11935  100    
##  2 wpgt      11935  100    
##  3 tsun      11935  100    
##  4 pres      10692   89.6  
##  5 wdir      10641   89.2  
##  6 wspd       9806   82.2  
##  7 prcp       5097   42.7  
##  8 tmin       2090   17.5  
##  9 tmax        891    7.47 
## 10 tavg         78    0.654
## 11 time          0    0
## Missingness spread of prcp over consecutive spans of 250 rows
Weather_Bhubhneshwar %>% miss_var_span(var = prcp, span_every = 250)
## # A tibble: 48 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     48        202     0.192         0.808       250
##  2            2     39        211     0.156         0.844       250
##  3            3     30        220     0.12          0.88        250
##  4            4     36        214     0.144         0.856       250
##  5            5     33        217     0.132         0.868       250
##  6            6     27        223     0.108         0.892       250
##  7            7     32        218     0.128         0.872       250
##  8            8     28        222     0.112         0.888       250
##  9            9     36        214     0.144         0.856       250
## 10           10     34        216     0.136         0.864       250
## # ℹ 38 more rows
## Tabulate how many variables share each missing-value count
Weather_Bhubhneshwar %>% miss_var_table()
## # A tibble: 9 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1     9.09
## 2            78      1     9.09
## 3           891      1     9.09
## 4          2090      1     9.09
## 5          5097      1     9.09
## 6          9806      1     9.09
## 7         10641      1     9.09
## 8         10692      1     9.09
## 9         11935      3    27.3
## Visualise the percentage of missing values
Weather_Bhubhneshwar %>% vis_miss()

## Upset plot of the missing-data patterns
Weather_Bhubhneshwar %>% gg_miss_upset()

## Heat map of missingness, split by the values of prcp
Weather_Bhubhneshwar %>% gg_miss_fct(fct = prcp)
## Warning: Removed 10 rows containing missing values (`geom_tile()`).

## Missingness of prcp in consecutive spans of 250 rows
Weather_Bhubhneshwar %>% gg_miss_span(var = prcp, span_every = 250)

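## With this we can clearly see that snow, wpgt and tsun are completely missing (100%), pres, wdir and wspd are mostly missing (82-90%), and the precipitation data is missing a lot (42.7%), followed by tmin (17.5%)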
## Count cells that contain one of the listed placeholder strings instead of a
## real NA; every column reports 0 in the output below
miss_scan_count(data = Weather_Bhubhneshwar, search = list("N/A", "NA", "na", " ","missing")) ## No empty cells
## # A tibble: 11 × 2
##    Variable     n
##    <chr>    <int>
##  1 time         0
##  2 tavg         0
##  3 tmin         0
##  4 tmax         0
##  5 prcp         0
##  6 snow         0
##  7 wdir         0
##  8 wspd         0
##  9 wpgt         0
## 10 pres         0
## 11 tsun         0
## Shadow matrix: one "NA" / "!NA" factor column per original variable
Weather_Bhubhneshwar %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 11
##   time_NA tavg_NA tmin_NA tmax_NA prcp_NA snow_NA wdir_NA wspd_NA wpgt_NA
##   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>  
## 1 !NA     !NA     NA      !NA     !NA     NA      NA      NA      NA     
## 2 !NA     !NA     !NA     NA      !NA     NA      NA      NA      NA     
## 3 !NA     !NA     !NA     !NA     !NA     NA      NA      NA      NA     
## 4 !NA     !NA     NA      !NA     !NA     NA      NA      NA      NA     
## 5 !NA     !NA     !NA     !NA     !NA     NA      NA      NA      NA     
## 6 !NA     !NA     !NA     NA      !NA     NA      NA      NA      NA     
## # ℹ 2 more variables: pres_NA <fct>, tsun_NA <fct>
## Nabular data: the original columns with the shadow columns bound on,
## restricted to variables that contain missing values
Weather_Bhubhneshwar %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 21
##   time        tavg  tmin  tmax  prcp snow   wdir  wspd wpgt   pres tsun  tavg_NA
##   <chr>      <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl> <lgl> <fct>  
## 1 1990-01-01  20.1  NA    28       0 NA       NA    NA NA       NA NA    !NA    
## 2 1990-01-02  20.7  16.4  NA       0 NA       NA    NA NA       NA NA    !NA    
## 3 1990-01-03  20.7  16    27.4     0 NA       NA    NA NA       NA NA    !NA    
## 4 1990-01-04  18.8  NA    28       0 NA       NA    NA NA       NA NA    !NA    
## 5 1990-01-05  19.8  11    28.2     0 NA       NA    NA NA       NA NA    !NA    
## 6 1990-01-06  22.2  12.5  NA       0 NA       NA    NA NA       NA NA    !NA    
## # ℹ 9 more variables: tmin_NA <fct>, tmax_NA <fct>, prcp_NA <fct>,
## #   snow_NA <fct>, wdir_NA <fct>, wspd_NA <fct>, wpgt_NA <fct>, pres_NA <fct>,
## #   tsun_NA <fct>
# Explore the relationship with the missing values: compare tavg between rows
# where prcp was recorded (prcp_NA == "!NA") and rows where it is missing.
# na.rm = TRUE is required because tavg itself contains NAs; without it one
# missing tavg turns the mean and sd of the whole group into NA.
Weather_Bhubhneshwar %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## Output recorded before na.rm = TRUE was added (re-run to refresh):
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          27.2    3.29
## 2 NA           NA     NA
# Without na.rm = TRUE the SD and mean of the "NA" group became NA as well

# Density of tavg, coloured by prcp missingness, one panel per tmin missingness
ggplot(bind_shadow(Weather_Bhubhneshwar), aes(x = tavg, color = prcp_NA)) +
  geom_density() +
  facet_wrap(~tmin_NA)
## Warning: Removed 78 rows containing non-finite values (`stat_density()`).

# Explore the missingness in precipitation and air temperature, and display
# the missingness using `geom_miss_point`
ggplot(Weather_Bhubhneshwar, aes(x = tavg, y = prcp)) + geom_miss_point()

# Same plot with one panel per year. The time column of this dataset is stored
# as "yyyy-mm-dd" (e.g. "1990-01-01"), unlike the "dd-mm-yyyy" city files, so
# it is parsed with ymd(); dmy() raised "All formats failed to parse" here.
ggplot(Weather_Bhubhneshwar, aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~year(ymd(time)))

# NOTE(review): the earlier remark that not much data is missing was made on
# the plot whose years failed to parse; miss_var_summary() reports prcp as
# 42.7% missing, so re-check the yearly panels after re-running.

##Analysing weather_Rourkela_2021_2022.csv

## Have a look at the data
#definitely has more columns than the cities that we have seen so far

## Size of the table, then a compact view of every column
print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
Weather_Rourkela %>% dim()
## [1] 426  11
# 11 columns here, i.e. 6 more than the 5-column city files above

print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
Weather_Rourkela %>% glimpse()
## Rows: 426
## Columns: 11
## $ time <chr> "2021-07-06", "2021-07-07", "2021-07-08", "2021-07-09", "2021-07-…
## $ tavg <dbl> 29.3, 29.7, 27.4, 28.5, 29.0, 29.3, 28.9, 28.6, 29.0, 29.5, 29.6,…
## $ tmin <dbl> 26.2, 27.3, 25.8, 26.1, 26.2, 26.2, 25.7, 25.5, 25.4, 25.5, 26.3,…
## $ tmax <dbl> 32.6, 33.4, 29.7, 32.1, 32.6, 33.7, 32.9, 32.5, 32.7, 33.4, 33.2,…
## $ prcp <dbl> NA, 11.1, 66.9, 11.4, 2.7, 10.8, 5.4, 10.1, 1.9, 1.3, 1.1, 6.0, 8…
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> 197, 199, 186, 173, 121, 70, 95, 101, 138, 152, 179, 181, 181, 19…
## $ wspd <dbl> 6.8, 6.9, 6.3, 3.9, 4.6, 5.8, 7.0, 5.5, 6.5, 8.7, 9.5, 8.3, 8.0, …
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> 1002.5, 1002.2, 1001.8, 1001.0, 1000.9, 1002.2, 1003.4, 1002.8, 1…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# First six rows of the data set
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
Weather_Rourkela %>% head(n = 6L)
##         time tavg tmin tmax prcp snow wdir wspd wpgt   pres tsun
## 1 2021-07-06 29.3 26.2 32.6   NA   NA  197  6.8   NA 1002.5   NA
## 2 2021-07-07 29.7 27.3 33.4 11.1   NA  199  6.9   NA 1002.2   NA
## 3 2021-07-08 27.4 25.8 29.7 66.9   NA  186  6.3   NA 1001.8   NA
## 4 2021-07-09 28.5 26.1 32.1 11.4   NA  173  3.9   NA 1001.0   NA
## 5 2021-07-10 29.0 26.2 32.6  2.7   NA  121  4.6   NA 1000.9   NA
## 6 2021-07-11 29.3 26.2 33.7 10.8   NA   70  5.8   NA 1002.2   NA
# Last six rows of the data set
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
Weather_Rourkela %>% tail(n = 6L)
##           time tavg tmin tmax prcp snow wdir wspd wpgt   pres tsun
## 421 2022-08-30 29.8 26.4 34.3  0.0   NA  174  7.6   NA 1007.9   NA
## 422 2022-08-31 29.0 26.6 33.5  2.0   NA  187  8.6   NA 1006.8   NA
## 423 2022-09-01 29.1 25.7 33.2 11.5   NA  205  6.7   NA 1007.2   NA
## 424 2022-09-02 29.4 26.4 33.7  1.5   NA  189  7.0   NA 1007.5   NA
## 425 2022-09-03 28.7 26.6 32.6  8.0   NA  203  8.0   NA 1005.8   NA
## 426 2022-09-04 28.2 25.9 31.8 17.7   NA  211  6.8   NA 1004.8   NA
# Column names of the data frame
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
Weather_Rourkela %>% names()
##  [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow, wind direction (wdir), wind speed (wspd), peak wind gust (wpgt), pressure (pres) and sunshine duration (tsun)

# Internal structure: storage type and leading values of every column
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
Weather_Rourkela %>% str()
## 'data.frame':    426 obs. of  11 variables:
##  $ time: chr  "2021-07-06" "2021-07-07" "2021-07-08" "2021-07-09" ...
##  $ tavg: num  29.3 29.7 27.4 28.5 29 29.3 28.9 28.6 29 29.5 ...
##  $ tmin: num  26.2 27.3 25.8 26.1 26.2 26.2 25.7 25.5 25.4 25.5 ...
##  $ tmax: num  32.6 33.4 29.7 32.1 32.6 33.7 32.9 32.5 32.7 33.4 ...
##  $ prcp: num  NA 11.1 66.9 11.4 2.7 10.8 5.4 10.1 1.9 1.3 ...
##  $ snow: logi  NA NA NA NA NA NA ...
##  $ wdir: num  197 199 186 173 121 70 95 101 138 152 ...
##  $ wspd: num  6.8 6.9 6.3 3.9 4.6 5.8 7 5.5 6.5 8.7 ...
##  $ wpgt: logi  NA NA NA NA NA NA ...
##  $ pres: num  1002 1002 1002 1001 1001 ...
##  $ tsun: logi  NA NA NA NA NA NA ...
# Per-column summary statistics, including the NA count of each column
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
Weather_Rourkela %>% summary()
##      time                tavg            tmin            tmax      
##  Length:426         Min.   :14.60   Min.   : 8.20   Min.   :21.50  
##  Class :character   1st Qu.:24.40   1st Qu.:18.18   1st Qu.:29.60  
##  Mode  :character   Median :28.10   Median :25.20   Median :32.10  
##                     Mean   :26.71   Mean   :22.30   Mean   :32.25  
##                     3rd Qu.:29.30   3rd Qu.:26.10   3rd Qu.:33.80  
##                     Max.   :35.00   Max.   :29.30   Max.   :43.60  
##                     NA's   :2       NA's   :2       NA's   :2      
##       prcp           snow              wdir            wspd       
##  Min.   :  0.000   Mode:logical   Min.   :  0.0   Min.   : 2.900  
##  1st Qu.:  0.000   NA's:426       1st Qu.: 49.0   1st Qu.: 5.500  
##  Median :  0.200                  Median :168.0   Median : 6.600  
##  Mean   :  5.695                  Mean   :140.3   Mean   : 7.441  
##  3rd Qu.:  7.200                  3rd Qu.:195.2   3rd Qu.: 8.725  
##  Max.   :123.000                  Max.   :359.0   Max.   :20.400  
##  NA's   :3                        NA's   :2       NA's   :2       
##    wpgt              pres          tsun        
##  Mode:logical   Min.   : 993.1   Mode:logical  
##  NA's:426       1st Qu.:1002.5   NA's:426      
##                 Median :1005.5                 
##                 Mean   :1006.8                 
##                 3rd Qu.:1012.1                 
##                 Max.   :1020.6                 
##                 NA's   :2
# Count every NA cell across all rows and columns
Weather_Rourkela %>% is.na() %>% sum()
## [1] 1293
## In total, 1293 cells are NA

## Lets analyse the missing data of the dataset
Weather_Rourkela %>% n_miss() ## Total number of missing values in the data frame
## [1] 1293
Weather_Rourkela %>% miss_var_summary() ## Missing count and percentage for each variable
## # A tibble: 11 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 snow        426  100    
##  2 wpgt        426  100    
##  3 tsun        426  100    
##  4 prcp          3    0.704
##  5 tavg          2    0.469
##  6 tmin          2    0.469
##  7 tmax          2    0.469
##  8 wdir          2    0.469
##  9 wspd          2    0.469
## 10 pres          2    0.469
## 11 time          0    0
# Missingness of prcp counted within consecutive spans of 250 rows
Weather_Rourkela %>%
  miss_var_span(var = prcp, span_every = 250)
## # A tibble: 2 × 6
##   span_counter n_miss n_complete prop_miss prop_complete n_in_span
##          <int>  <int>      <int>     <dbl>         <dbl>     <int>
## 1            1      1        249    0.004          0.996       250
## 2            2      2        174    0.0114         0.989       176
# How many variables share each missing-value count
Weather_Rourkela %>% miss_var_table()
## # A tibble: 4 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      1     9.09
## 2             2      6    54.5 
## 3             3      1     9.09
## 4           426      3    27.3
Weather_Rourkela %>% vis_miss() ## Visualise the percentage of missing values

Weather_Rourkela %>% gg_miss_upset() ## Upset plot of co-occurring missingness

# NOTE(review): prcp is a numeric column; gg_miss_fct() is normally given a
# factor/grouping variable -- confirm that prcp is the intended `fct`.
Weather_Rourkela %>% gg_miss_fct(fct = prcp) ## Heat map of missingness
## Warning: Removed 10 rows containing missing values (`geom_tile()`).

Weather_Rourkela %>%
  gg_miss_span(var = prcp, span_every = 250) ## Visualise prcp missingness per 250-row span

## With this we can clearly see that snow, wpgt and tsun are entirely missing (100%), while prcp is missing in only 3 rows (0.7%) and the other measurements in 2 rows each (0.47%)
# Print the complete data frame to inspect every row
print(Weather_Rourkela)
##           time tavg tmin tmax  prcp snow wdir wspd wpgt   pres tsun
## 1   2021-07-06 29.3 26.2 32.6    NA   NA  197  6.8   NA 1002.5   NA
## 2   2021-07-07 29.7 27.3 33.4  11.1   NA  199  6.9   NA 1002.2   NA
## 3   2021-07-08 27.4 25.8 29.7  66.9   NA  186  6.3   NA 1001.8   NA
## 4   2021-07-09 28.5 26.1 32.1  11.4   NA  173  3.9   NA 1001.0   NA
## 5   2021-07-10 29.0 26.2 32.6   2.7   NA  121  4.6   NA 1000.9   NA
## 6   2021-07-11 29.3 26.2 33.7  10.8   NA   70  5.8   NA 1002.2   NA
## 7   2021-07-12 28.9 25.7 32.9   5.4   NA   95  7.0   NA 1003.4   NA
## 8   2021-07-13 28.6 25.5 32.5  10.1   NA  101  5.5   NA 1002.8   NA
## 9   2021-07-14 29.0 25.4 32.7   1.9   NA  138  6.5   NA 1002.7   NA
## 10  2021-07-15 29.5 25.5 33.4   1.3   NA  152  8.7   NA 1004.0   NA
## 11  2021-07-16 29.6 26.3 33.2   1.1   NA  179  9.5   NA 1006.1   NA
## 12  2021-07-17 29.8 27.0 32.9   6.0   NA  181  8.3   NA 1004.9   NA
## 13  2021-07-18 29.6 27.4 32.3   8.9   NA  181  8.0   NA 1002.8   NA
## 14  2021-07-19 29.0 27.1 31.5  12.1   NA  194  6.6   NA 1001.2   NA
## 15  2021-07-20 28.7 26.8 31.9  28.7   NA  254  5.6   NA 1001.1   NA
## 16  2021-07-21 28.4 26.2 31.7  10.1   NA  230  5.1   NA  998.5   NA
## 17  2021-07-22 28.8 25.5 32.8  17.3   NA   55  6.9   NA  995.8   NA
## 18  2021-07-23 27.5 26.5 29.0  48.3   NA    5  8.9   NA  994.6   NA
## 19  2021-07-24 28.5 26.2 32.3  12.4   NA  203  9.2   NA  996.8   NA
## 20  2021-07-25 29.4 26.0 33.0   4.7   NA  203  9.4   NA  999.7   NA
## 21  2021-07-26 29.5 26.9 33.0   1.8   NA  237 11.0   NA  999.5   NA
## 22  2021-07-27 27.9 26.5 30.5  20.9   NA  244 12.8   NA  998.3   NA
## 23  2021-07-28 27.1 26.0 29.2   9.9   NA  257 15.5   NA  998.5   NA
## 24  2021-07-29 27.4 25.8 30.6   6.4   NA  260 13.9   NA  999.0   NA
## 25  2021-07-30 26.9 25.8 28.7  17.1   NA  250 17.4   NA  998.7   NA
## 26  2021-07-31 27.1 25.2 30.7  16.0   NA  198 14.1   NA  999.6   NA
## 27  2021-08-01 28.6 25.7 32.7   2.6   NA  208 10.5   NA 1000.4   NA
## 28  2021-08-02 28.6 25.1 33.1   0.8   NA  224  9.0   NA 1001.6   NA
## 29  2021-08-03 28.5 25.7 31.8   0.4   NA  243  9.9   NA 1002.8   NA
## 30  2021-08-04 27.7 26.2 30.4  13.1   NA  221  5.1   NA 1003.4   NA
## 31  2021-08-05 28.3 26.3 31.5   8.2   NA  210  5.5   NA 1001.4   NA
## 32  2021-08-06 28.2 25.8 32.2  19.3   NA  201  5.5   NA 1001.2   NA
## 33  2021-08-07 28.5 26.5 31.9  17.5   NA  201  6.8   NA 1003.4   NA
## 34  2021-08-08 28.2 26.2 31.5  11.2   NA  193  6.5   NA 1005.2   NA
## 35  2021-08-09 29.1 26.4 32.6  10.6   NA  202  8.3   NA 1005.0   NA
## 36  2021-08-10 29.3 26.8 33.2   9.5   NA  223  9.1   NA 1002.9   NA
## 37  2021-08-11 28.9 26.4 32.0  13.8   NA  233  8.1   NA 1002.9   NA
## 38  2021-08-12 29.0 26.0 32.6   3.5   NA  199  6.2   NA 1004.7   NA
## 39  2021-08-13 29.6 26.7 33.2   1.0   NA  204  5.5   NA 1003.6   NA
## 40  2021-08-14 29.7 26.6 33.9  13.2   NA  209  5.1   NA 1001.8   NA
## 41  2021-08-15 28.8 26.1 33.2  17.7   NA  331  4.6   NA 1002.5   NA
## 42  2021-08-16 29.0 26.4 33.1  13.4   NA  193  3.7   NA 1004.1   NA
## 43  2021-08-17 29.1 26.5 32.9   9.8   NA   56  5.4   NA 1003.4   NA
## 44  2021-08-18 27.6 26.3 29.8   6.4   NA  171  7.9   NA 1002.1   NA
## 45  2021-08-19 28.0 25.9 31.4   9.0   NA  176  9.0   NA 1002.5   NA
## 46  2021-08-20 27.7 25.4 31.3   7.4   NA  169  9.4   NA 1004.8   NA
## 47  2021-08-21 28.5 25.9 31.9   3.8   NA  176 11.6   NA 1007.3   NA
## 48  2021-08-22 28.6 25.9 32.4   3.2   NA  192  7.3   NA 1007.1   NA
## 49  2021-08-23 29.2 26.5 33.3   5.0   NA  197  9.5   NA 1005.8   NA
## 50  2021-08-24 29.0 26.3 32.3   9.0   NA  216  6.2   NA 1004.9   NA
## 51  2021-08-25 29.2 26.7 33.4  20.9   NA  210  6.6   NA 1002.6   NA
## 52  2021-08-26 28.3 26.8 31.8   5.7   NA  217  5.6   NA 1002.0   NA
## 53  2021-08-27 27.6 26.2 29.5   4.2   NA    9  3.4   NA 1003.2   NA
## 54  2021-08-28 28.7 26.1 33.0   9.7   NA   58  7.2   NA 1001.3   NA
## 55  2021-08-29 28.4 25.7 32.8   8.3   NA   77  5.7   NA 1001.7   NA
## 56  2021-08-30 28.4 25.2 32.2   5.4   NA  111  7.1   NA 1004.6   NA
## 57  2021-08-31 28.4 25.6 32.6   6.7   NA  137  7.3   NA 1007.0   NA
## 58  2021-09-01 28.8 24.8 32.9   0.8   NA  164  8.1   NA 1007.5   NA
## 59  2021-09-02 28.8 26.0 33.3   5.0   NA  176  5.5   NA 1006.3   NA
## 60  2021-09-03 28.3 25.7 31.9  18.5   NA  132  3.9   NA 1004.7   NA
## 61  2021-09-04 28.6 25.7 32.7  10.6   NA  184  3.5   NA 1003.6   NA
## 62  2021-09-05 29.1 25.9 33.4   3.6   NA   76  4.4   NA 1003.2   NA
## 63  2021-09-06 28.4 26.2 32.0   8.7   NA   46 13.5   NA 1002.9   NA
## 64  2021-09-07 28.8 26.3 32.0   3.4   NA  107 10.4   NA 1003.7   NA
## 65  2021-09-08 27.8 25.7 29.9   4.9   NA  141  5.3   NA 1005.4   NA
## 66  2021-09-09 27.9 25.8 30.6   8.0   NA  176  4.2   NA 1005.2   NA
## 67  2021-09-10 28.2 25.7 31.3   4.9   NA   72  3.2   NA 1004.2   NA
## 68  2021-09-11 27.0 25.3 31.2  38.7   NA   48  7.3   NA 1002.6   NA
## 69  2021-09-12 28.2 25.7 32.5   4.3   NA   35 11.2   NA  999.7   NA
## 70  2021-09-13 27.1 26.3 29.0  48.6   NA   33 20.4   NA  994.2   NA
## 71  2021-09-14 27.2 26.1 30.3 113.4   NA  162 19.0   NA  998.9   NA
## 72  2021-09-15 27.4 25.5 30.3  16.7   NA  165 12.1   NA 1004.2   NA
## 73  2021-09-16 27.9 25.3 31.4   8.4   NA  152  5.6   NA 1005.0   NA
## 74  2021-09-17 27.9 25.1 31.6  14.9   NA  181  5.5   NA 1004.7   NA
## 75  2021-09-18 28.5 25.5 32.2   1.6   NA  198  5.6   NA 1006.4   NA
## 76  2021-09-19 27.3 25.6 30.2   7.4   NA  191  5.2   NA 1008.0   NA
## 77  2021-09-20 27.4 25.2 30.8   6.6   NA  187  2.9   NA 1006.6   NA
## 78  2021-09-21 27.0 25.0 30.4  11.6   NA  337  4.6   NA 1004.3   NA
## 79  2021-09-22 26.8 24.9 29.6  13.1   NA  225  6.9   NA 1003.0   NA
## 80  2021-09-23 27.2 25.1 30.0  10.4   NA  137  5.3   NA 1005.5   NA
## 81  2021-09-24 28.2 23.8 32.8   0.2   NA  140  5.9   NA 1006.9   NA
## 82  2021-09-25 28.7 25.1 32.6   0.5   NA  110  4.5   NA 1005.6   NA
## 83  2021-09-26 28.6 26.1 33.0   1.5   NA   49 11.9   NA 1003.8   NA
## 84  2021-09-27 27.9 25.6 31.9   4.5   NA   72  9.6   NA 1004.2   NA
## 85  2021-09-28 27.2 24.8 30.8  11.2   NA   67  5.2   NA 1006.3   NA
## 86  2021-09-29 27.2 25.2 30.4  14.9   NA  226  5.5   NA 1006.6   NA
## 87  2021-09-30 26.8 25.0 29.9  17.9   NA  184 10.2   NA 1008.2   NA
## 88  2021-10-01 27.5 24.8 31.5  17.2   NA  187  8.3   NA 1008.5   NA
## 89  2021-10-02 28.3 25.9 31.8  12.7   NA  210  7.2   NA 1008.4   NA
## 90  2021-10-03 28.3 25.3 32.4  11.8   NA  201  6.3   NA 1007.4   NA
## 91  2021-10-04 28.4 24.8 32.2   6.0   NA  191  4.8   NA 1007.0   NA
## 92  2021-10-05 28.1 25.4 32.3   1.2   NA  127  3.5   NA 1008.5   NA
## 93  2021-10-06 28.5 25.2 32.9   0.8   NA  177  4.1   NA 1007.8   NA
## 94  2021-10-07 29.1 26.0 32.8   0.9   NA  207  5.6   NA 1006.4   NA
## 95  2021-10-08 29.0 25.8 33.4   0.8   NA  191  4.6   NA 1005.6   NA
## 96  2021-10-09 28.8 24.9 33.2   0.0   NA   52  4.8   NA 1005.8   NA
## 97  2021-10-10 27.8 23.0 32.9   0.0   NA  355  4.5   NA 1006.5   NA
## 98  2021-10-11 27.7 22.5 33.0   0.0   NA   23  5.2   NA 1006.0   NA
## 99  2021-10-12 28.3 23.5 33.1   0.0   NA    0  5.0   NA 1005.5   NA
## 100 2021-10-13 28.9 24.2 33.8   0.0   NA   14  5.8   NA 1004.8   NA
## 101 2021-10-14 28.1 24.2 32.8   1.8   NA   31  7.7   NA 1004.7   NA
## 102 2021-10-15 28.7 25.8 33.8   1.2   NA   76  6.9   NA 1003.4   NA
## 103 2021-10-16 27.7 24.8 32.8   5.6   NA   47  6.7   NA 1004.4   NA
## 104 2021-10-17 26.9 25.0 31.5  20.0   NA   53  6.9   NA 1005.3   NA
## 105 2021-10-18 26.2 24.2 28.7  20.9   NA  202  6.1   NA 1004.7   NA
## 106 2021-10-19 26.9 25.0 30.2  15.3   NA  182 10.4   NA 1007.9   NA
## 107 2021-10-20 27.0 24.6 30.9   0.8   NA  237  6.3   NA 1010.6   NA
## 108 2021-10-21 26.0 21.9 31.0   0.0   NA  253  5.2   NA 1011.1   NA
## 109 2021-10-22 25.0 19.8 31.6   0.0   NA  210  4.5   NA 1012.0   NA
## 110 2021-10-23 24.8 18.7 31.5   0.0   NA   25  3.9   NA 1013.5   NA
## 111 2021-10-24 24.7 19.2 30.8   0.0   NA   25  4.4   NA 1013.9   NA
## 112 2021-10-25 24.5 18.6 30.5   0.0   NA   12  5.3   NA 1012.5   NA
## 113 2021-10-26 24.3 18.8 30.2   0.0   NA   16  6.3   NA 1011.8   NA
## 114 2021-10-27 25.2 20.4 30.3   0.0   NA   37  8.6   NA 1012.8   NA
## 115 2021-10-28 25.3 20.9 30.4   0.0   NA   40  9.1   NA 1013.3   NA
## 116 2021-10-29 24.9 20.4 30.5   0.0   NA   42  8.3   NA 1014.6   NA
## 117 2021-10-30 25.1 20.1 30.8   0.0   NA   43  7.1   NA 1014.5   NA
## 118 2021-10-31 25.1 20.3 30.9   0.0   NA   13  6.7   NA 1013.8   NA
## 119 2021-11-01 24.5 19.7 30.3   0.0   NA  358  4.3   NA 1013.8   NA
## 120 2021-11-02 25.1 19.9 30.1   0.0   NA   68  5.6   NA 1014.4   NA
## 121 2021-11-03 24.9 21.0 30.1   0.0   NA   72  4.7   NA 1013.8   NA
## 122 2021-11-04 24.2 19.1 30.2   0.0   NA   41  4.7   NA 1013.2   NA
## 123 2021-11-05 23.9 18.7 30.0   0.0   NA   49  5.4   NA 1013.0   NA
## 124 2021-11-06 22.5 16.7 29.4   0.0   NA   28  5.6   NA 1011.7   NA
## 125 2021-11-07 22.3 15.7 29.3   0.0   NA   25  5.6   NA 1011.9   NA
## 126 2021-11-08 22.2 15.7 29.8   0.0   NA   23  7.0   NA 1012.7   NA
## 127 2021-11-09 22.5 15.4 30.0   0.0   NA   29  6.5   NA 1013.3   NA
## 128 2021-11-10 23.0 15.9 30.5   0.0   NA   20  6.3   NA 1012.5   NA
## 129 2021-11-11 24.6 18.3 31.1   0.0   NA   36  9.0   NA 1011.3   NA
## 130 2021-11-12 25.8 21.6 31.6   9.7   NA   55  5.8   NA 1011.8   NA
## 131 2021-11-13 24.7 23.2 27.1  34.4   NA   34  5.4   NA 1012.1   NA
## 132 2021-11-14 24.6 23.2 28.0  34.4   NA   53  4.2   NA 1011.5   NA
## 133 2021-11-15 24.2 21.0 27.2  14.4   NA   83  7.1   NA 1010.5   NA
## 134 2021-11-16 23.7 20.4 28.7   0.0   NA   43  8.1   NA 1011.5   NA
## 135 2021-11-17 23.0 18.3 28.6   0.0   NA   37  8.3   NA 1013.5   NA
## 136 2021-11-18 22.3 18.0 28.4   0.0   NA   26  6.5   NA 1012.2   NA
## 137 2021-11-19 22.9 17.7 29.0   0.0   NA   37  5.2   NA 1010.4   NA
## 138 2021-11-20 25.3 20.5 30.5   0.0   NA   58  4.9   NA 1010.2   NA
## 139 2021-11-21 25.4 21.6 30.4   0.0   NA   57  4.8   NA 1011.5   NA
## 140 2021-11-22 25.5 22.2 29.9   0.1   NA  194  4.2   NA 1011.7   NA
## 141 2021-11-23 24.7 20.1 30.3   0.0   NA   15  4.0   NA 1011.5   NA
## 142 2021-11-24 23.6 19.1 29.6   0.0   NA   21  5.5   NA 1012.9   NA
## 143 2021-11-25 23.1 17.9 29.8   0.0   NA   37  4.5   NA 1013.2   NA
## 144 2021-11-26 22.4 17.7 28.9   0.0   NA   31  5.1   NA 1013.7   NA
## 145 2021-11-27 21.2 16.5 28.0   0.0   NA   24  6.9   NA 1015.1   NA
## 146 2021-11-28 19.7 13.6 26.9   0.0   NA   25  6.5   NA 1016.3   NA
## 147 2021-11-29 19.6 13.9 27.0   0.0   NA   15  5.7   NA 1016.2   NA
## 148 2021-11-30 19.7 14.2 26.9   0.0   NA   24  7.5   NA 1015.9   NA
## 149 2021-12-01 19.7 13.4 27.3   0.0   NA   33  8.2   NA 1015.4   NA
## 150 2021-12-02 20.2 13.5 27.6   0.0   NA    9  6.2   NA 1014.7   NA
## 151 2021-12-03 21.1 14.7 27.7   0.0   NA   24  6.9   NA 1015.0   NA
## 152 2021-12-04 21.6 20.3 23.3   0.7   NA   32  8.4   NA 1014.7   NA
## 153 2021-12-05 21.6 20.4 24.5   4.7   NA   40 14.5   NA 1013.5   NA
## 154 2021-12-06 22.4 18.4 27.6   0.5   NA   37  8.1   NA 1012.6   NA
## 155 2021-12-07 22.3 17.7 28.1   0.0   NA   41  6.5   NA 1014.6   NA
## 156 2021-12-08 22.2 16.7 28.7   0.0   NA   43  8.1   NA 1016.2   NA
## 157 2021-12-09 22.6 19.1 27.1   0.0   NA   35  6.9   NA 1017.7   NA
## 158 2021-12-10 21.7 18.3 27.8   0.0   NA   69  4.7   NA 1017.6   NA
## 159 2021-12-11 21.6 16.5 27.9   0.0   NA   40  5.8   NA 1017.3   NA
## 160 2021-12-12 21.2 17.4 27.1   0.0   NA   44  6.1   NA 1018.1   NA
## 161 2021-12-13 19.6 14.3 26.2   0.0   NA  346  4.4   NA 1017.0   NA
## 162 2021-12-14 19.0 13.4 26.0   0.0   NA   34  5.0   NA 1016.2   NA
## 163 2021-12-15 18.6 12.9 25.6   0.0   NA   29  6.5   NA 1016.2   NA
## 164 2021-12-16 18.2 12.9 25.1   0.0   NA   44  7.5   NA 1017.0   NA
## 165 2021-12-17 18.6 11.1 25.9   0.0   NA   42  5.5   NA 1017.1   NA
## 166 2021-12-18 19.6 12.9 27.4   0.0   NA   35  6.9   NA 1016.4   NA
## 167 2021-12-19 18.0 13.0 24.0   0.0   NA   13  7.5   NA 1017.6   NA
## 168 2021-12-20 14.6  8.2 21.5   0.0   NA   24  6.4   NA 1018.1   NA
## 169 2021-12-21 15.0  9.1 22.8   0.0   NA  338  5.6   NA 1016.8   NA
## 170 2021-12-22 16.2  9.7 24.5   0.0   NA   17  5.3   NA 1014.8   NA
## 171 2021-12-23 16.6  9.2 25.5   0.0   NA   13  5.7   NA 1014.6   NA
## 172 2021-12-24 17.8 10.4 26.6   0.0   NA  131  5.6   NA 1014.3   NA
## 173 2021-12-25 19.7 12.6 27.9   0.0   NA  138  6.1   NA 1014.2   NA
## 174 2021-12-26 20.2 14.1 27.6   0.0   NA   29  4.7   NA 1016.2   NA
## 175 2021-12-27 20.0 13.7 27.7   0.0   NA  137  5.7   NA 1017.2   NA
## 176 2021-12-28 20.6 15.6 27.2  11.7   NA  100  5.2   NA 1016.7   NA
## 177 2021-12-29 19.3 17.8 21.5  21.9   NA  104  6.2   NA 1017.1   NA
## 178 2021-12-30 19.3 16.3 24.1   3.7   NA   20  6.5   NA 1020.1   NA
## 179 2021-12-31 18.3 14.8 24.0   0.0   NA   42  6.9   NA 1020.6   NA
## 180 2022-01-01 17.6 12.7 23.1   0.0   NA   20  6.1   NA 1020.3   NA
## 181 2022-01-02 17.3 12.0 23.0   0.0   NA   23  5.8   NA 1018.8   NA
## 182 2022-01-03 16.7 11.6 23.8   0.0   NA   32  5.2   NA 1017.4   NA
## 183 2022-01-04 17.1 10.6 25.1   0.0   NA   24  4.7   NA 1017.3   NA
## 184 2022-01-05 18.5 12.9 25.5   0.0   NA   12  3.0   NA 1016.8   NA
## 185 2022-01-06 19.1 13.8 26.5   0.0   NA    4  4.1   NA 1016.0   NA
## 186 2022-01-07 19.7 14.0 27.1   0.0   NA   32  5.9   NA 1016.5   NA
## 187 2022-01-08 19.7 14.1 27.3   0.0   NA  163  4.7   NA 1016.5   NA
## 188 2022-01-09 20.0 15.1 27.7   0.0   NA   93  4.9   NA 1014.4   NA
## 189 2022-01-10 19.9 16.4 23.3   0.5   NA   19  4.9   NA 1015.7   NA
## 190 2022-01-11 19.4 16.9 22.0  15.1   NA   40  3.5   NA 1015.8   NA
## 191 2022-01-12 18.7 16.4 22.1   2.2   NA  154  4.3   NA 1015.3   NA
## 192 2022-01-13 18.9 14.8 24.7   0.0   NA   46  6.5   NA 1015.8   NA
## 193 2022-01-14 18.9 16.1 23.0   0.0   NA   44  8.0   NA 1015.7   NA
## 194 2022-01-15 18.8 15.5 23.8   0.5   NA   32  6.6   NA 1016.4   NA
## 195 2022-01-16 18.0 13.5 23.4   0.0   NA   35  6.5   NA 1018.3   NA
## 196 2022-01-17 17.1 11.4 23.4   0.0   NA   21  5.3   NA 1019.0   NA
## 197 2022-01-18 16.4 10.6 23.2   0.0   NA    5  5.4   NA 1016.7   NA
## 198 2022-01-19 16.6 10.2 23.3   0.0   NA    0  5.8   NA 1014.5   NA
## 199 2022-01-20 16.4 10.8 23.7   0.0   NA   27  4.7   NA 1012.0   NA
## 200 2022-01-21 18.1 11.1 26.0   0.2   NA   61  4.3   NA 1011.1   NA
## 201 2022-01-22 19.5 13.9 26.8   0.0   NA  110  4.4   NA 1009.7   NA
## 202 2022-01-23 19.6 16.7 23.9   4.8   NA  140  4.8   NA 1009.4   NA
## 203 2022-01-24 20.2 16.1 25.7   0.7   NA  316  4.6   NA 1010.6   NA
## 204 2022-01-25 18.5 14.4 23.6   0.0   NA   22  6.6   NA 1012.3   NA
## 205 2022-01-26 18.5 14.6 24.0   0.2   NA  349  7.8   NA 1013.3   NA
## 206 2022-01-27 16.3 11.7 22.2   0.0   NA  349  7.2   NA 1015.4   NA
## 207 2022-01-28 14.9  9.2 21.6   0.0   NA    4  6.3   NA 1016.7   NA
## 208 2022-01-29 16.0  8.4 23.0   0.0   NA    5  5.1   NA 1016.5   NA
## 209 2022-01-30 16.6  9.7 25.2   0.0   NA  204  4.5   NA 1013.2   NA
## 210 2022-01-31 18.1 11.1 27.4   0.0   NA  201  4.6   NA 1011.9   NA
## 211 2022-02-01 18.9 12.1 27.5   0.0   NA  129  4.6   NA 1010.5   NA
## 212 2022-02-02 20.2 12.7 28.8   0.0   NA   75  4.7   NA 1011.9   NA
## 213 2022-02-03 21.2 13.2 29.0   0.0   NA  137  9.6   NA 1010.6   NA
## 214 2022-02-04 20.4 15.1 25.6  22.1   NA  212 10.6   NA 1008.9   NA
## 215 2022-02-05 17.7 13.1 24.3   0.0   NA    0  6.3   NA 1012.7   NA
## 216 2022-02-06 17.3 10.3 25.5   0.0   NA  351  5.9   NA 1016.0   NA
## 217 2022-02-07 18.3 11.7 26.9   0.0   NA  342  5.2   NA 1015.2   NA
## 218 2022-02-08 18.9 11.4 27.5   0.0   NA   26  5.3   NA 1013.9   NA
## 219 2022-02-09 21.1 13.5 28.3   0.0   NA  154 10.1   NA 1013.5   NA
## 220 2022-02-10 21.2 17.2 26.1   0.7   NA  129  6.2   NA 1013.7   NA
## 221 2022-02-11 18.9 13.0 25.4   0.0   NA  338  7.4   NA 1014.8   NA
## 222 2022-02-12 17.9 11.3 25.8   0.0   NA   29  6.0   NA 1014.6   NA
## 223 2022-02-13 17.5  9.7 26.1   0.0   NA    9  5.4   NA 1013.6   NA
## 224 2022-02-14 18.4 10.6 27.5   0.0   NA    8  5.1   NA 1012.9   NA
## 225 2022-02-15 19.6 12.6 28.1   0.0   NA   53  5.5   NA 1012.3   NA
## 226 2022-02-16 21.1 13.4 28.9   0.0   NA   53  5.4   NA 1010.1   NA
## 227 2022-02-17 21.7 16.1 29.2   0.0   NA   71  5.8   NA 1010.1   NA
## 228 2022-02-18 21.7 14.9 29.6   0.0   NA   45  6.8   NA 1011.6   NA
## 229 2022-02-19 22.7 14.5 30.4   0.0   NA  321  5.8   NA 1012.1   NA
## 230 2022-02-20 24.4 18.6 31.2   0.0   NA  249  9.1   NA 1008.2   NA
## 231 2022-02-21 22.2 14.1 29.9   0.0   NA  187  8.1   NA 1009.0   NA
## 232 2022-02-22 21.0 13.0 29.6   0.0   NA   35  5.8   NA 1013.2   NA
## 233 2022-02-23 21.9 14.4 31.2   0.0   NA  143  6.2   NA 1015.7   NA
## 234 2022-02-24 24.4 18.2 32.5   0.0   NA  172  9.1   NA 1014.9   NA
## 235 2022-02-25 24.8 18.1 32.8   0.0   NA  175  8.0   NA 1014.3   NA
## 236 2022-02-26 24.4 17.8 32.1   0.0   NA   21  7.1   NA 1015.8   NA
## 237 2022-02-27 25.0 17.6 33.2   0.0   NA  102  6.1   NA 1015.0   NA
## 238 2022-02-28 24.6 17.8 32.3   0.0   NA   39  5.5   NA 1014.3   NA
## 239 2022-03-01 25.3 18.1 32.8   0.0   NA    8  6.4   NA 1014.6   NA
## 240 2022-03-02 25.3 17.9 33.0   0.0   NA   38  5.2   NA 1014.4   NA
## 241 2022-03-03 25.1 17.7 33.0   0.0   NA   67  4.9   NA 1014.3   NA
## 242 2022-03-04 25.1 17.5 33.6   0.0   NA  347  6.5   NA 1013.4   NA
## 243 2022-03-05 25.6 18.4 33.6   0.0   NA  209  7.6   NA 1011.8   NA
## 244 2022-03-06 25.1 16.9 33.0   0.0   NA   29  6.9   NA 1012.5   NA
## 245 2022-03-07 25.3 17.5 33.5   0.0   NA   20  5.2   NA 1012.9   NA
## 246 2022-03-08 25.5 18.4 33.6   0.0   NA    6  6.1   NA 1011.8   NA
## 247 2022-03-09 25.3 16.9 33.9   0.0   NA   32  6.4   NA 1011.6   NA
## 248 2022-03-10 26.1 17.5 35.1   0.0   NA   40  8.1   NA 1011.1   NA
## 249 2022-03-11 26.6 18.1 34.7   0.0   NA    7  6.5   NA 1010.8   NA
## 250 2022-03-12 26.8 19.8 34.7   0.0   NA  331  7.0   NA 1010.3   NA
## 251 2022-03-13 25.8 18.2 34.3   0.0   NA   17  6.0   NA 1010.7   NA
## 252 2022-03-14 26.2 17.6 35.1   0.0   NA  332  6.3   NA 1010.3   NA
## 253 2022-03-15 27.5 18.5 36.4   0.0   NA  273  4.7   NA 1007.5   NA
## 254 2022-03-16 28.1 21.0 36.6   0.0   NA  134  5.7   NA 1005.5   NA
## 255 2022-03-17 29.3 21.0 37.9   0.0   NA  159  6.6   NA 1005.7   NA
## 256 2022-03-18 29.6 21.5 37.8   0.0   NA  161  7.1   NA 1007.0   NA
## 257 2022-03-19 30.1 24.2 37.0   0.0   NA  189  6.8   NA 1005.7   NA
## 258 2022-03-20 29.9 22.1 38.1   0.0   NA  185  6.8   NA 1004.6   NA
## 259 2022-03-21 30.4 24.0 37.7   0.0   NA  174  7.5   NA 1005.7   NA
## 260 2022-03-22 30.3 23.8 38.3   0.0   NA  168  7.6   NA 1006.0   NA
## 261 2022-03-23 29.4 21.7 38.0   0.0   NA  134  5.7   NA 1006.6   NA
## 262 2022-03-24 29.4 20.4 38.5   0.0   NA  195  5.8   NA 1006.4   NA
## 263 2022-03-25 30.7 23.9 38.2   0.0   NA  202  9.9   NA 1006.6   NA
## 264 2022-03-26 32.0 26.4 38.6   0.0   NA  212  8.0   NA 1007.6   NA
## 265 2022-03-27 30.1 21.3 38.4   0.0   NA  312  6.9   NA 1009.1   NA
## 266 2022-03-28 29.4 20.8 38.3   0.0   NA  178  7.2   NA 1007.9   NA
## 267 2022-03-29 30.1 19.9 39.9   0.0   NA  180  7.7   NA 1005.9   NA
## 268 2022-03-30 30.2 21.5 40.0   0.0   NA  167  7.1   NA 1005.7   NA
## 269 2022-03-31 31.2 22.5 40.4   0.0   NA  175  9.0   NA 1006.5   NA
## 270 2022-04-01 31.8 24.6 40.6   0.0   NA  181 11.1   NA 1006.3   NA
## 271 2022-04-02 32.0 24.8 40.3   0.0   NA  170  7.0   NA 1007.5   NA
## 272 2022-04-03 31.5 24.7 40.2   0.0   NA  177  7.1   NA 1007.6   NA
## 273 2022-04-04 31.8 25.1 40.6   0.0   NA  188 13.1   NA 1008.1   NA
## 274 2022-04-05 31.8 24.6 40.5   0.0   NA  187 10.6   NA 1010.2   NA
## 275 2022-04-06 31.7 25.0 40.1   0.0   NA  184  8.2   NA 1010.4   NA
## 276 2022-04-07 31.9 24.2 39.9   0.0   NA  182  7.8   NA 1009.5   NA
## 277 2022-04-08 32.6 25.6 40.5   0.0   NA  181 10.7   NA 1007.5   NA
## 278 2022-04-09 33.0 25.8 40.6   0.0   NA  190 11.8   NA 1006.1   NA
## 279 2022-04-10 33.5 25.3 41.4   0.0   NA  195 13.3   NA 1004.7   NA
## 280 2022-04-11 33.5 26.2 42.2   0.0   NA  192 10.6   NA 1003.9   NA
## 281 2022-04-12 33.6 24.5 41.6   0.0   NA  176  8.8   NA 1004.0   NA
## 282 2022-04-13 33.9 26.2 41.8   0.0   NA  197 11.3   NA 1003.1   NA
## 283 2022-04-14 34.6 28.9 41.4   0.0   NA  199 11.2   NA 1002.8   NA
## 284 2022-04-15 34.0 26.7 41.1   0.0   NA  191 11.4   NA 1002.8   NA
## 285 2022-04-16 33.2 25.5 41.1   0.0   NA  185  8.7   NA 1002.6   NA
## 286 2022-04-17 33.1 25.8 41.2   0.0   NA  194  7.3   NA 1003.3   NA
## 287 2022-04-18 33.4 25.0 42.1   0.0   NA  206  7.1   NA 1005.1   NA
## 288 2022-04-19 34.0 24.8 42.6   0.0   NA  189 10.0   NA 1005.8   NA
## 289 2022-04-20 34.0 26.2 42.2   0.0   NA  171  9.5   NA 1004.7   NA
## 290 2022-04-21 31.7 26.4 39.8   0.7   NA  161  8.9   NA 1005.2   NA
## 291 2022-04-22 32.6 26.8 39.7   0.0   NA  172  7.3   NA 1006.6   NA
## 292 2022-04-23 33.4 25.9 40.7   0.0   NA  193  7.1   NA 1006.2   NA
## 293 2022-04-24 34.1 27.0 41.7   0.0   NA  189  8.3   NA 1004.4   NA
## 294 2022-04-25 33.3 23.8 41.6   0.0   NA  181  5.8   NA 1004.1   NA
## 295 2022-04-26 33.3 24.4 41.9   0.0   NA  179  7.1   NA 1004.6   NA
## 296 2022-04-27 33.7 26.1 42.1   0.0   NA  184  8.4   NA 1004.1   NA
## 297 2022-04-28 34.3 26.0 42.9   0.0   NA  179  7.3   NA 1004.6   NA
## 298 2022-04-29 34.4 26.4 42.7   0.0   NA  182  8.2   NA 1004.2   NA
## 299 2022-04-30 35.0 27.0 43.6   0.0   NA  190 13.0   NA 1002.1   NA
## 300 2022-05-01 32.1 26.4 41.2   0.0   NA  146 10.0   NA 1002.0   NA
## 301 2022-05-02 31.5 27.4 40.3   0.0   NA  165 10.6   NA 1002.9   NA
## 302 2022-05-03 32.5 26.8 41.3   0.0   NA  173 13.6   NA 1003.9   NA
## 303 2022-05-04 30.4 25.6 37.2   1.5   NA  146  6.5   NA 1005.4   NA
## 304 2022-05-05 30.5 25.3 36.4   0.8   NA  135  6.3   NA 1005.4   NA
## 305 2022-05-06 31.4 25.3 38.0   0.0   NA  123  6.3   NA 1005.2   NA
## 306 2022-05-07 31.9 25.8 38.8   0.0   NA  157  6.4   NA 1004.6   NA
## 307 2022-05-08 31.9 25.6 38.2   0.0   NA  142  6.4   NA 1003.8   NA
## 308 2022-05-09 31.3 24.8 37.7   0.0   NA  108  5.9   NA 1003.6   NA
## 309 2022-05-10 30.5 26.8 37.1   5.6   NA  110  5.7   NA 1003.6   NA
## 310 2022-05-11 29.6 26.4 34.6   3.1   NA  159  9.1   NA 1003.6   NA
## 311 2022-05-12 31.2 26.8 36.6   0.3   NA  183 16.5   NA 1002.6   NA
## 312 2022-05-13 33.3 27.2 39.9   0.0   NA  199 14.8   NA 1000.2   NA
## 313 2022-05-14 34.2 27.8 42.0   0.0   NA  192 17.1   NA 1000.1   NA
## 314 2022-05-15 32.9 28.1 41.9   0.0   NA  183 16.7   NA 1001.9   NA
## 315 2022-05-16 33.7 27.8 41.5   0.0   NA  191 13.4   NA 1001.7   NA
## 316 2022-05-17 33.1 28.2 39.6   0.0   NA  188 14.4   NA 1001.5   NA
## 317 2022-05-18 32.3 28.1 38.3   0.0   NA  179 13.7   NA 1002.9   NA
## 318 2022-05-19 31.8 27.9 37.8   0.3   NA  188 11.2   NA 1003.9   NA
## 319 2022-05-20 33.4 27.4 39.9   0.0   NA  203 11.9   NA 1001.2   NA
## 320 2022-05-21 32.5 28.1 41.5   0.1   NA  190 12.8   NA  998.5   NA
## 321 2022-05-22 31.0 25.8 38.9   3.3   NA  172 10.1   NA  997.5   NA
## 322 2022-05-23 30.8 25.5 36.3   0.9   NA  193  6.6   NA  999.0   NA
## 323 2022-05-24 30.2 25.4 36.7  11.5   NA  157  6.4   NA 1003.2   NA
## 324 2022-05-25 29.8 24.7 36.0   0.0   NA  181  5.5   NA 1004.6   NA
## 325 2022-05-26   NA   NA   NA    NA   NA   NA   NA   NA     NA   NA
## 326 2022-05-27   NA   NA   NA    NA   NA   NA   NA   NA     NA   NA
## 327 2022-05-28 31.0 25.4 37.2   0.0   NA  220  7.6   NA 1000.6   NA
## 328 2022-05-29 32.1 26.3 39.2   0.0   NA  182  5.5   NA 1000.2   NA
## 329 2022-05-30 32.5 28.1 38.0   0.0   NA  223  6.7   NA 1000.2   NA
## 330 2022-05-31 33.3 26.9 39.8   0.0   NA  182  6.1   NA  999.0   NA
## 331 2022-06-01 32.3 26.9 39.7   0.0   NA  152  6.3   NA  999.6   NA
## 332 2022-06-02 33.0 27.3 40.9   0.0   NA  161  7.1   NA  999.5   NA
## 333 2022-06-03 33.0 27.7 39.9   0.0   NA  183  9.5   NA  999.4   NA
## 334 2022-06-04 32.0 28.0 39.5   2.5   NA  171 10.7   NA 1000.3   NA
## 335 2022-06-05 30.9 27.0 39.3   4.2   NA  168 10.8   NA 1000.9   NA
## 336 2022-06-06 30.8 26.3 37.3   0.0   NA  166  9.8   NA 1000.6   NA
## 337 2022-06-07 32.4 27.5 39.9   0.0   NA  186 12.0   NA  999.4   NA
## 338 2022-06-08 33.0 27.1 40.7   0.0   NA  179  9.1   NA  998.4   NA
## 339 2022-06-09 32.3 28.4 38.6   6.5   NA  175 11.7   NA  998.5   NA
## 340 2022-06-10 33.2 29.3 39.1   0.3   NA  194  9.8   NA  999.6   NA
## 341 2022-06-11 33.2 28.2 39.8   0.0   NA  205 12.1   NA 1000.5   NA
## 342 2022-06-12 33.4 28.0 40.2   0.0   NA  201 10.7   NA 1001.4   NA
## 343 2022-06-13 32.4 28.5 38.4   0.0   NA  193  8.5   NA 1003.0   NA
## 344 2022-06-14 33.2 28.4 39.9   0.7   NA  193  9.5   NA 1002.1   NA
## 345 2022-06-15 30.2 27.6 34.2   6.2   NA  215  8.7   NA 1003.3   NA
## 346 2022-06-16 30.4 27.0 36.3   4.0   NA  201 10.8   NA 1003.8   NA
## 347 2022-06-17 28.5 26.8 33.5  15.5   NA  189 11.1   NA 1004.3   NA
## 348 2022-06-18 28.2 26.3 32.2  22.3   NA  200  8.9   NA 1004.7   NA
## 349 2022-06-19 27.7 26.3 31.9  11.7   NA  201  9.0   NA 1003.9   NA
## 350 2022-06-20 28.1 26.3 31.6  16.3   NA  200  8.7   NA 1002.1   NA
## 351 2022-06-21 27.8 25.3 32.1   3.2   NA  184  7.0   NA 1000.8   NA
## 352 2022-06-22 28.0 24.6 33.2   1.0   NA  171  3.4   NA 1002.9   NA
## 353 2022-06-23 28.5 25.8 31.7   7.1   NA  196  6.3   NA 1003.8   NA
## 354 2022-06-24 27.5 25.8 30.6  33.1   NA  184  5.8   NA 1002.3   NA
## 355 2022-06-25 27.8 25.7 31.0   2.6   NA  188  5.1   NA 1000.4   NA
## 356 2022-06-26 28.6 25.3 32.9   2.7   NA  159  7.2   NA 1002.0   NA
## 357 2022-06-27 29.8 26.1 33.9   0.1   NA  173 10.4   NA 1003.5   NA
## 358 2022-06-28 30.0 26.4 34.0   0.2   NA  185 10.2   NA 1002.1   NA
## 359 2022-06-29 29.5 26.5 33.7   5.3   NA  201  6.4   NA 1000.7   NA
## 360 2022-06-30 27.9 25.9 31.3  23.5   NA  187  5.5   NA 1000.5   NA
## 361 2022-07-01 28.4 25.9 32.0  22.2   NA  220  5.2   NA 1000.2   NA
## 362 2022-07-02 28.3 26.1 32.4  26.1   NA  219  5.2   NA  999.1   NA
## 363 2022-07-03 27.5 26.1 31.1  18.9   NA  186  5.2   NA  999.0   NA
## 364 2022-07-04 28.4 25.5 32.5   4.3   NA  153  5.4   NA  998.7   NA
## 365 2022-07-05 28.3 26.0 32.2  23.1   NA  156  5.1   NA  998.9   NA
## 366 2022-07-06 27.8 25.9 30.8   9.5   NA  122  6.7   NA 1000.6   NA
## 367 2022-07-07 29.3 25.8 34.4   1.5   NA  125  5.3   NA 1002.6   NA
## 368 2022-07-08 28.9 25.9 34.3   3.4   NA   90  7.3   NA 1000.3   NA
## 369 2022-07-09 29.0 26.1 33.1   6.4   NA   66  9.8   NA  997.9   NA
## 370 2022-07-10 29.0 26.3 33.3   2.8   NA   60  9.7   NA  997.3   NA
## 371 2022-07-11 29.0 26.1 33.1   2.6   NA   52 12.2   NA  997.9   NA
## 372 2022-07-12 29.1 26.8 32.2   1.8   NA   49 16.1   NA  996.4   NA
## 373 2022-07-13 29.2 26.8 32.8   4.7   NA   59 13.8   NA  994.9   NA
## 374 2022-07-14 28.0 26.5 30.5  27.6   NA   75 10.8   NA  995.3   NA
## 375 2022-07-15 27.7 25.8 31.6  24.2   NA   64  6.5   NA  998.4   NA
## 376 2022-07-16 27.9 25.7 31.9  15.0   NA   45  5.3   NA  999.6   NA
## 377 2022-07-17 28.7 26.4 32.0  11.4   NA   93 11.5   NA  999.8   NA
## 378 2022-07-18 28.6 25.6 32.5   2.1   NA  162 10.0   NA 1001.3   NA
## 379 2022-07-19 29.1 26.3 31.9   1.5   NA  218 10.9   NA 1002.0   NA
## 380 2022-07-20 29.2 26.0 32.8  10.1   NA  231  8.4   NA 1001.9   NA
## 381 2022-07-21 28.6 26.0 32.5   7.0   NA  217  5.2   NA 1001.9   NA
## 382 2022-07-22 27.8 26.2 31.8  26.6   NA  140  3.4   NA 1002.4   NA
## 383 2022-07-23 27.6 25.7 30.7  14.2   NA  120  3.2   NA 1001.9   NA
## 384 2022-07-24 27.2 25.5 29.8  20.5   NA  148  5.8   NA 1002.6   NA
## 385 2022-07-25 27.6 25.0 31.0  12.1   NA  172  8.9   NA 1005.4   NA
## 386 2022-07-26 28.4 25.5 32.2   3.2   NA  174 10.0   NA 1007.2   NA
## 387 2022-07-27 28.6 25.7 32.1   1.0   NA  180  8.1   NA 1007.2   NA
## 388 2022-07-28 28.9 26.0 31.8   5.5   NA  197  5.3   NA 1007.4   NA
## 389 2022-07-29 28.6 25.8 32.5  10.6   NA  212  5.8   NA 1006.4   NA
## 390 2022-07-30 28.9 26.2 34.0   4.6   NA  200  6.2   NA 1004.8   NA
## 391 2022-07-31 28.6 25.9 32.8   1.8   NA  204  5.1   NA 1003.0   NA
## 392 2022-08-01 28.8 26.1 33.0   7.3   NA  206  6.0   NA 1002.8   NA
## 393 2022-08-02 28.9 25.7 33.4   6.8   NA  208  4.6   NA 1002.4   NA
## 394 2022-08-03 28.5 26.5 31.3  12.2   NA  136  4.0   NA 1003.0   NA
## 395 2022-08-04 28.2 26.0 31.6  12.4   NA   81  3.8   NA 1003.1   NA
## 396 2022-08-05 28.8 25.6 32.5   3.4   NA  107  5.5   NA 1001.3   NA
## 397 2022-08-06 28.9 26.5 32.8  13.9   NA  359  4.4   NA  999.0   NA
## 398 2022-08-07 28.5 26.3 31.9   6.0   NA   48 11.5   NA  997.2   NA
## 399 2022-08-08 29.0 26.4 33.6   6.8   NA   57 13.5   NA  995.4   NA
## 400 2022-08-09 28.0 26.7 31.1   8.0   NA   62 16.4   NA  993.1   NA
## 401 2022-08-10 26.5 25.9 27.8  14.4   NA  153 11.6   NA  994.6   NA
## 402 2022-08-11 26.3 25.1 28.0  30.5   NA  178 10.0   NA  999.4   NA
## 403 2022-08-12 26.4 24.8 30.6  46.6   NA  212  7.8   NA 1001.0   NA
## 404 2022-08-13 27.3 25.2 31.1   8.0   NA  278  7.0   NA  999.4   NA
## 405 2022-08-14 26.8 25.1 29.1  88.4   NA   19 11.2   NA  996.5   NA
## 406 2022-08-15 27.3 25.6 29.7  28.3   NA  170 18.5   NA 1001.0   NA
## 407 2022-08-16 28.6 24.6 33.3   1.6   NA  208  7.8   NA 1005.7   NA
## 408 2022-08-17 29.0 24.8 33.3   0.1   NA  236  6.8   NA 1004.9   NA
## 409 2022-08-18 28.3 25.9 33.1  20.2   NA  279  6.3   NA 1002.5   NA
## 410 2022-08-19 27.0 25.4 29.9  26.9   NA  296  8.0   NA 1001.0   NA
## 411 2022-08-20 25.8 25.2 26.4 123.0   NA  220 19.8   NA  996.8   NA
## 412 2022-08-21 27.1 25.1 31.1  20.0   NA  172 13.8   NA  999.7   NA
## 413 2022-08-22 28.4 25.1 33.1   0.4   NA  180  9.2   NA 1001.2   NA
## 414 2022-08-23 28.7 24.9 33.3   3.6   NA  318  6.2   NA 1003.7   NA
## 415 2022-08-24 26.7 25.3 30.0  17.9   NA  248  4.2   NA 1005.3   NA
## 416 2022-08-25 28.4 24.9 32.5   3.2   NA  210  6.2   NA 1003.9   NA
## 417 2022-08-26 29.4 25.3 33.8   0.0   NA  219  6.1   NA 1003.2   NA
## 418 2022-08-27 28.7 26.1 33.7   2.7   NA  193  7.7   NA 1003.3   NA
## 419 2022-08-28 28.3 25.8 32.0   1.8   NA  177  6.1   NA 1004.7   NA
## 420 2022-08-29 28.9 25.7 33.2   1.9   NA  166  5.1   NA 1007.4   NA
## 421 2022-08-30 29.8 26.4 34.3   0.0   NA  174  7.6   NA 1007.9   NA
## 422 2022-08-31 29.0 26.6 33.5   2.0   NA  187  8.6   NA 1006.8   NA
## 423 2022-09-01 29.1 25.7 33.2  11.5   NA  205  6.7   NA 1007.2   NA
## 424 2022-09-02 29.4 26.4 33.7   1.5   NA  189  7.0   NA 1007.5   NA
## 425 2022-09-03 28.7 26.6 32.6   8.0   NA  203  8.0   NA 1005.8   NA
## 426 2022-09-04 28.2 25.9 31.8  17.7   NA  211  6.8   NA 1004.8   NA
# Count the cells that hold a common textual stand-in for a missing value.
miss_scan_count(
  data = Weather_Rourkela,
  search = list("N/A", "NA", "na", " ", "missing")
) ## No empty cells
## # A tibble: 11 × 2
##    Variable     n
##    <chr>    <int>
##  1 time         0
##  2 tavg         0
##  3 tmin         0
##  4 tmax         0
##  5 prcp         0
##  6 snow         0
##  7 wdir         0
##  8 wspd         0
##  9 wpgt         0
## 10 pres         0
## 11 tsun         0
## Build the shadow matrix: one NA / !NA factor column per variable
Weather_Rourkela %>%
  as_shadow()
## # A tibble: 426 × 11
##    time_NA tavg_NA tmin_NA tmax_NA prcp_NA snow_NA wdir_NA wspd_NA wpgt_NA
##    <fct>   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>   <fct>  
##  1 !NA     !NA     !NA     !NA     NA      NA      !NA     !NA     NA     
##  2 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  3 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  4 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  5 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  6 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  7 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  8 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
##  9 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
## 10 !NA     !NA     !NA     !NA     !NA     NA      !NA     !NA     NA     
## # ℹ 416 more rows
## # ℹ 2 more variables: pres_NA <fct>, tsun_NA <fct>
# Create nabular data: the original columns with the shadow columns appended
Weather_Rourkela %>%
  bind_shadow(only_miss = TRUE)
## # A tibble: 426 × 21
##    time       tavg  tmin  tmax  prcp snow   wdir  wspd wpgt   pres tsun  tavg_NA
##    <chr>     <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <lgl> <dbl> <lgl> <fct>  
##  1 2021-07-…  29.3  26.2  32.6  NA   NA      197   6.8 NA    1002. NA    !NA    
##  2 2021-07-…  29.7  27.3  33.4  11.1 NA      199   6.9 NA    1002. NA    !NA    
##  3 2021-07-…  27.4  25.8  29.7  66.9 NA      186   6.3 NA    1002. NA    !NA    
##  4 2021-07-…  28.5  26.1  32.1  11.4 NA      173   3.9 NA    1001  NA    !NA    
##  5 2021-07-…  29    26.2  32.6   2.7 NA      121   4.6 NA    1001. NA    !NA    
##  6 2021-07-…  29.3  26.2  33.7  10.8 NA       70   5.8 NA    1002. NA    !NA    
##  7 2021-07-…  28.9  25.7  32.9   5.4 NA       95   7   NA    1003. NA    !NA    
##  8 2021-07-…  28.6  25.5  32.5  10.1 NA      101   5.5 NA    1003. NA    !NA    
##  9 2021-07-…  29    25.4  32.7   1.9 NA      138   6.5 NA    1003. NA    !NA    
## 10 2021-07-…  29.5  25.5  33.4   1.3 NA      152   8.7 NA    1004  NA    !NA    
## # ℹ 416 more rows
## # ℹ 9 more variables: tmin_NA <fct>, tmax_NA <fct>, prcp_NA <fct>,
## #   snow_NA <fct>, wdir_NA <fct>, wspd_NA <fct>, wpgt_NA <fct>, pres_NA <fct>,
## #   tsun_NA <fct>
# Lets explore how tavg relates to the missingness of prcp.
# tavg itself contains NAs, so mean() and sd() need na.rm = TRUE; without it
# any group that holds a missing tavg collapses to NA.
Weather_Rourkela %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(prcp_NA) %>%
  summarise(
    tavg_mean = mean(tavg, na.rm = TRUE),
    tavg_sd = sd(tavg, na.rm = TRUE)
  )
## NOTE: the output below was recorded before na.rm = TRUE was added; re-run
## to refresh it.
## # A tibble: 2 × 3
##   prcp_NA tavg_mean tavg_sd
##   <fct>       <dbl>   <dbl>
## 1 !NA          26.7    4.61
## 2 NA           NA     NA
# Without na.rm = TRUE the mean and SD of the NA group came out as NA

# Density of tavg, coloured by whether prcp is missing and split into panels
# by whether tmin is missing.
Weather_Rourkela %>%
  bind_shadow() %>%
  ggplot(mapping = aes(x = tavg, colour = prcp_NA)) +
  geom_density() +
  facet_wrap(facets = ~tmin_NA)
## Warning: Removed 2 rows containing non-finite values (`stat_density()`).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

# Explore the missingness in precipitation and air temperature, and display
# the missingness using `geom_miss_point()`
ggplot(data = Weather_Rourkela, mapping = aes(x = tavg, y = prcp)) +
  geom_miss_point()

# Same plot, one panel per calendar year. `time` is stored as "YYYY-MM-DD"
# text, so it has to be parsed with ymd(); dmy() failed on every value
# ("All formats failed to parse"), which left the facet variable NA.
ggplot(Weather_Rourkela, aes(x = tavg, y = prcp)) +
  geom_miss_point() +
  facet_wrap(~ year(ymd(time)))

# Looks like there is not too much missing data

##Analysing AQI stations: stations.csv

## Have a look at the data

# Report the number of rows and columns.
print(x = "The dimensions of the dataset")
## [1] "The dimensions of the dataset"
AQ_stations %>% dim()
## [1] 230   5
# Show a column-wise preview of the data.
print(x = "Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_stations)
## Rows: 230
## Columns: 5
## $ StationId   <chr> "AP001", "AP002", "AP003", "AP004", "AP005", "AS001", "BR0…
## $ StationName <chr> "Secretariat, Amaravati - APPCB", "Anand Kala Kshetram, Ra…
## $ City        <chr> "Amaravati", "Rajamahendravaram", "Tirupati", "Vijayawada"…
## $ State       <chr> "Andhra Pradesh", "Andhra Pradesh", "Andhra Pradesh", "And…
## $ Status      <chr> "Active", "", "", "", "Active", "Active", "", "", "", "", …
# List the column names.
print(x = "Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
AQ_stations %>% names()
## [1] "StationId"   "StationName" "City"        "State"       "Status"
# Show the type and first values of every column.
print(x = "Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_stations)
## 'data.frame':    230 obs. of  5 variables:
##  $ StationId  : chr  "AP001" "AP002" "AP003" "AP004" ...
##  $ StationName: chr  "Secretariat, Amaravati - APPCB" "Anand Kala Kshetram, Rajamahendravaram - APPCB" "Tirumala, Tirupati - APPCB" "PWD Grounds, Vijayawada - APPCB" ...
##  $ City       : chr  "Amaravati" "Rajamahendravaram" "Tirupati" "Vijayawada" ...
##  $ State      : chr  "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
##  $ Status     : chr  "Active" "" "" "" ...
# Per-column summary of the data.
print(x = "Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
AQ_stations %>% summary()
##   StationId         StationName            City              State          
##  Length:230         Length:230         Length:230         Length:230        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##     Status         
##  Length:230        
##  Class :character  
##  Mode  :character
# NOTE(review): attach() is kept so that later code relying on the bare column
# names keeps working, but it copies the columns as they are right now; later
# changes to AQ_stations are not reflected in the attached names.
attach(AQ_stations)

## There are no records with NA, but there are records with empty strings.
## Lets replace them with NA and then list the affected rows.
AQ_stations[AQ_stations == ""] <- NA
## Index through AQ_stations$Status: the bare name Status resolves to the
## attach()ed copy, which still holds "" instead of NA and so matched 0 rows.
AQ_stations[is.na(AQ_stations$Status), ]

##Analysing and Performing Imputations on AQI Station Hour wise - station_hour.csv

## Have a look at the data

# Report the number of rows and columns.
print(x = "The dimensions of the dataset")
## [1] "The dimensions of the dataset"
AQ_station_hour %>% dim()
## [1] 2589083      16
# Show a column-wise preview of the data.
print(x = "Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_hour)
## Rows: 2,589,083
## Columns: 16
## $ StationId  <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Datetime   <chr> "2017-11-24 17:00:00", "2017-11-24 18:00:00", "2017-11-24 1…
## $ PM2.5      <dbl> 60.50, 65.50, 80.00, 81.50, 75.25, 69.25, 67.50, 68.00, 73.…
## $ PM10       <dbl> 98.00, 111.25, 132.00, 133.25, 116.00, 108.25, 111.50, 111.…
## $ NO         <dbl> 2.35, 2.70, 2.10, 1.95, 1.43, 0.70, 1.05, 1.25, 0.30, 0.80,…
## $ NO2        <dbl> 30.80, 24.20, 25.18, 16.25, 17.48, 18.47, 12.15, 14.12, 14.…
## $ NOx        <dbl> 18.25, 15.07, 15.15, 10.23, 10.43, 10.38, 7.30, 8.50, 7.90,…
## $ NH3        <dbl> 8.50, 9.77, 12.02, 11.58, 12.03, 13.80, 17.65, 20.28, 11.50…
## $ CO         <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.1, 0.1,…
## $ SO2        <dbl> 11.85, 13.17, 12.08, 10.47, 9.12, 9.25, 9.40, 8.90, 11.80, …
## $ O3         <dbl> 126.40, 117.12, 98.98, 112.20, 106.35, 91.10, 112.70, 116.1…
## $ Benzene    <dbl> 0.10, 0.10, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.23,…
## $ Toluene    <dbl> 6.10, 6.25, 5.98, 6.72, 5.75, 5.02, 5.60, 5.55, 6.60, 6.77,…
## $ Xylene     <dbl> 0.10, 0.15, 0.18, 0.10, 0.08, 0.00, 0.10, 0.05, 0.00, 0.10,…
## $ AQI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
# List the column names.
print(x = "Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
AQ_station_hour %>% names()
##  [1] "StationId"  "Datetime"   "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
# Show the type and first values of every column.
print(x = "Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_hour)
## 'data.frame':    2589083 obs. of  16 variables:
##  $ StationId : chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ Datetime  : chr  "2017-11-24 17:00:00" "2017-11-24 18:00:00" "2017-11-24 19:00:00" "2017-11-24 20:00:00" ...
##  $ PM2.5     : num  60.5 65.5 80 81.5 75.2 ...
##  $ PM10      : num  98 111 132 133 116 ...
##  $ NO        : num  2.35 2.7 2.1 1.95 1.43 0.7 1.05 1.25 0.3 0.8 ...
##  $ NO2       : num  30.8 24.2 25.2 16.2 17.5 ...
##  $ NOx       : num  18.2 15.1 15.2 10.2 10.4 ...
##  $ NH3       : num  8.5 9.77 12.02 11.58 12.03 ...
##  $ CO        : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.3 0.1 ...
##  $ SO2       : num  11.85 13.17 12.08 10.47 9.12 ...
##  $ O3        : num  126 117 99 112 106 ...
##  $ Benzene   : num  0.1 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.23 ...
##  $ Toluene   : num  6.1 6.25 5.98 6.72 5.75 5.02 5.6 5.55 6.6 6.77 ...
##  $ Xylene    : num  0.1 0.15 0.18 0.1 0.08 0 0.1 0.05 0 0.1 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
# Per-column summary of the data, including NA counts for numeric columns.
print(x = "Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
AQ_station_hour %>% summary()
##   StationId           Datetime             PM2.5             PM10        
##  Length:2589083     Length:2589083     Min.   :   0.0   Min.   :   0.0   
##  Class :character   Class :character   1st Qu.:  28.2   1st Qu.:  64.0   
##  Mode  :character   Mode  :character   Median :  52.6   Median : 116.2   
##                                        Mean   :  80.9   Mean   : 158.5   
##                                        3rd Qu.:  97.7   3rd Qu.: 204.0   
##                                        Max.   :1000.0   Max.   :1000.0   
##                                        NA's   :647689   NA's   :1119252  
##        NO              NO2              NOx              NH3         
##  Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    
##  1st Qu.:  3.0    1st Qu.: 13.1    1st Qu.: 11.3    1st Qu.: 11.2    
##  Median :  7.2    Median : 24.8    Median : 22.9    Median : 22.4    
##  Mean   : 22.8    Mean   : 35.2    Mean   : 40.6    Mean   : 28.7    
##  3rd Qu.: 18.6    3rd Qu.: 45.5    3rd Qu.: 45.7    3rd Qu.: 37.8    
##  Max.   :500.0    Max.   :500.0    Max.   :500.0    Max.   :500.0    
##  NA's   :553711   NA's   :528973   NA's   :490808   NA's   :1236618  
##        CO              SO2               O3            Benzene      
##  Min.   :  0.0    Min.   :  0.0    Min.   :  0.0    Min.   :  0.0   
##  1st Qu.:  0.4    1st Qu.:  4.2    1st Qu.: 11.0    1st Qu.:  0.1   
##  Median :  0.8    Median :  8.2    Median : 24.8    Median :  1.0   
##  Mean   :  1.5    Mean   : 12.1    Mean   : 38.1    Mean   :  3.3   
##  3rd Qu.:  1.4    3rd Qu.: 14.5    3rd Qu.: 49.5    3rd Qu.:  3.2   
##  Max.   :498.6    Max.   :200.0    Max.   :997.0    Max.   :498.1   
##  NA's   :499302   NA's   :742737   NA's   :725973   NA's   :861579  
##     Toluene            Xylene             AQI          AQI_Bucket       
##  Min.   :  0.0     Min.   :  0.0     Min.   :   5.0   Length:2589083    
##  1st Qu.:  0.3     1st Qu.:  0.0     1st Qu.:  84.0   Class :character  
##  Median :  3.4     Median :  0.2     Median : 131.0   Mode  :character  
##  Mean   : 14.9     Mean   :  2.4     Mean   : 180.2                     
##  3rd Qu.: 15.1     3rd Qu.:  1.8     3rd Qu.: 259.0                     
##  Max.   :500.0     Max.   :500.0     Max.   :3133.0                     
##  NA's   :1042366   NA's   :2075104   NA's   :570190
# Put the columns of AQ_station_hour on the search path.
# NOTE(review): attach() copies the columns as they are at this point, so the
# recoding of "" to NA below is not reflected in the attached names; refer to
# AQ_station_hour$<column> when the recoded values are needed.
attach(AQ_station_hour)
## The following object is masked from AQ_stations:
## 
##     StationId
# Recode empty strings to NA so that they are counted as missing values.
AQ_station_hour [AQ_station_hour == ""] <- NA

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5:647689  PM10:1119252  NO:553711       NO2:528973      NOx:490808       NH3:1236618
## CO:499302     SO2:742737    O3:725973       Benzene:861579  Toluene:1042366  Xylene:2075104

# Tally the rows that fall into each AQI bucket.
AQ_station_hour %>%
  group_by(AQI_Bucket) %>%
  count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket        n
##   <chr>         <int>
## 1 Good         152113
## 2 Moderate     675008
## 3 Poor         239990
## 4 Satisfactory 530164
## 5 Severe       120468
## 6 Very Poor    301150
## 7 <NA>         570190
## Looks like Moderate entries are the highest ones but second highest is NA entries...
## Lets analyse the missing data of the dataset
AQ_station_hour %>% n_miss() ## Total number of missing values
## [1] 11664492
AQ_station_hour %>% miss_var_summary() ## Missingness summary
## # A tibble: 16 × 3
##    variable    n_miss pct_miss
##    <chr>        <int>    <dbl>
##  1 Xylene     2075104     80.1
##  2 NH3        1236618     47.8
##  3 PM10       1119252     43.2
##  4 Toluene    1042366     40.3
##  5 Benzene     861579     33.3
##  6 SO2         742737     28.7
##  7 O3          725973     28.0
##  8 PM2.5       647689     25.0
##  9 AQI         570190     22.0
## 10 AQI_Bucket  570190     22.0
## 11 NO          553711     21.4
## 12 NO2         528973     20.4
## 13 CO          499302     19.3
## 14 NOx         490808     19.0
## 15 StationId        0      0  
## 16 Datetime         0      0
# Missingness of AQI summarised over consecutive spans of 250 rows.
AQ_station_hour %>%
  miss_var_span(var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 10,357 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     43        207     0.172         0.828       250
##  2            2      0        250     0             1           250
##  3            3      0        250     0             1           250
##  4            4      0        250     0             1           250
##  5            5      0        250     0             1           250
##  6            6      0        250     0             1           250
##  7            7      0        250     0             1           250
##  8            8      0        250     0             1           250
##  9            9      0        250     0             1           250
## 10           10      8        242     0.032         0.968       250
## # ℹ 10,347 more rows
# Tabulate how many variables share each missing-value count.
AQ_station_hour %>%
  miss_var_table()
## # A tibble: 14 × 3
##    n_miss_in_var n_vars pct_vars
##            <int>  <int>    <dbl>
##  1             0      2    12.5 
##  2        490808      1     6.25
##  3        499302      1     6.25
##  4        528973      1     6.25
##  5        553711      1     6.25
##  6        570190      2    12.5 
##  7        647689      1     6.25
##  8        725973      1     6.25
##  9        742737      1     6.25
## 10        861579      1     6.25
## 11       1042366      1     6.25
## 12       1119252      1     6.25
## 13       1236618      1     6.25
## 14       2075104      1     6.25
## vis_miss(AQ_station_hour) Unable to visualise % of missing due to large data size
AQ_station_hour %>% gg_miss_upset() ## plot for missing data

# NOTE(review): fct = AQI splits on a continuous measurement; a categorical
# column such as AQI_Bucket may have been intended - confirm.
AQ_station_hour %>% gg_miss_fct(fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).

AQ_station_hour %>%
  gg_miss_span(var = AQI, span_every = 250) ## Visualize span of AQI missingness

## The missingness summary shows Xylene is missing the most (80.1%), followed by NH3 (47.8%)
## Count the cells that hold a textual stand-in for a missing value. The
## search terms are matched as regular expressions, so each one is anchored to
## match a whole cell; the bare " " used previously matched every value that
## contains a space (all Datetime values and every "Very Poor" bucket).
miss_scan_count(
  data = AQ_station_hour,
  search = list("^N/A$", "^NA$", "^na$", "^\\s+$", "^missing$")
)
## NOTE: the output below was recorded with the unanchored patterns; re-run to
## refresh it.
## # A tibble: 16 × 2
##    Variable         n
##    <chr>        <int>
##  1 StationId        0
##  2 Datetime   2589083
##  3 PM2.5            0
##  4 PM10             0
##  5 NO               0
##  6 NO2              0
##  7 NOx              0
##  8 NH3              0
##  9 CO               0
## 10 SO2              0
## 11 O3               0
## 12 Benzene          0
## 13 Toluene          0
## 14 Xylene           0
## 15 AQI              0
## 16 AQI_Bucket  301150
## Build the shadow matrix (NA / !NA per cell) and show its first rows
AQ_station_hour %>%
  as_shadow() %>%
  head()
## # A tibble: 6 × 16
##   StationId_NA Datetime_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA
##   <fct>        <fct>       <fct>    <fct>   <fct> <fct>  <fct>  <fct>  <fct>
## 1 !NA          !NA         !NA      !NA     !NA   !NA    !NA    !NA    !NA  
## 2 !NA          !NA         !NA      !NA     !NA   !NA    !NA    !NA    !NA  
## 3 !NA          !NA         !NA      !NA     !NA   !NA    !NA    !NA    !NA  
## 4 !NA          !NA         !NA      !NA     !NA   !NA    !NA    !NA    !NA  
## 5 !NA          !NA         !NA      !NA     !NA   !NA    !NA    !NA    !NA  
## 6 !NA          !NA         !NA      !NA     !NA   !NA    !NA    !NA    !NA  
## # ℹ 7 more variables: SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>,
## #   Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>
# Create nabular data (data plus shadow columns) and show its first rows
AQ_station_hour %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 30
##   StationId Datetime       PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3
##   <chr>     <chr>          <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001     2017-11-24 17…  60.5   98   2.35  30.8  18.2  8.5    0.1 11.8  126. 
## 2 AP001     2017-11-24 18…  65.5  111.  2.7   24.2  15.1  9.77   0.1 13.2  117. 
## 3 AP001     2017-11-24 19…  80    132   2.1   25.2  15.2 12.0    0.1 12.1   99.0
## 4 AP001     2017-11-24 20…  81.5  133.  1.95  16.2  10.2 11.6    0.1 10.5  112. 
## 5 AP001     2017-11-24 21…  75.2  116   1.43  17.5  10.4 12.0    0.1  9.12 106. 
## 6 AP001     2017-11-24 22…  69.2  108.  0.7   18.5  10.4 13.8    0.1  9.25  91.1
## # ℹ 19 more variables: Benzene <dbl>, Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## #   NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## #   Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## #   AQI_Bucket_NA <fct>
# Lets explore how CO relates to the missingness of AQI.
# CO contains NAs, so mean() and sd() need na.rm = TRUE; without it both
# groups collapse to NA. The first column is named CO_mean (the earlier
# "tCO_mean" was a leftover from the tavg summary).
AQ_station_hour %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(AQI_NA) %>%
  summarise(
    CO_mean = mean(CO, na.rm = TRUE),
    CO_sd = sd(CO, na.rm = TRUE)
  )
## NOTE: the output below was recorded before na.rm = TRUE was added (and with
## the old column name); re-run to refresh it.
## # A tibble: 2 × 3
##   AQI_NA tCO_mean CO_sd
##   <fct>     <dbl> <dbl>
## 1 !NA          NA    NA
## 2 NA           NA    NA
# Without na.rm = TRUE the mean and SD came out as NA for both groups

# Density of CO, coloured by whether AQI is missing and split into panels by
# whether O3 is missing.
AQ_station_hour %>%
  bind_shadow() %>%
  ggplot(mapping = aes(x = CO, colour = AQI_NA)) +
  geom_density() +
  facet_wrap(facets = ~O3_NA)
## Warning: Removed 499302 rows containing non-finite values (`stat_density()`).

# Explore the missingness in CO and AQI, and display it using
# `geom_miss_point()`
ggplot(data = AQ_station_hour, mapping = aes(x = CO, y = AQI)) +
  geom_miss_point()

# Unlike the weather data, a substantial share is missing here (AQI: 22.0%, CO: 19.3%)

# Impute all the missing data with values 10% below the range of each variable
AQ_station_hour_imp <- AQ_station_hour %>%
  impute_below_all()
ggplot(data = AQ_station_hour_imp, mapping = aes(x = CO, y = AQI)) +
  geom_miss_point()

# Bind the shadow columns before imputing so the imputed values stay traceable
AQ_station_hour_imp_track <- AQ_station_hour %>%
  bind_shadow() %>%
  impute_below_all()
ggplot(data = AQ_station_hour_imp_track, mapping = aes(x = AQI, fill = AQI_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = AQ_station_hour_imp_track, mapping = aes(x = O3, fill = O3_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(
  data = AQ_station_hour_imp_track,
  mapping = aes(x = CO, y = AQI, colour = AQI_NA)
) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now lets fix the critically missing parameters AQI and O3 via linear
# regression on other explanatory parameters.
# O3 is imputed first: a value cannot be predicted when one of its predictors
# is missing, so filling O3 before it is used as a predictor for AQI lets more
# AQI values be imputed.
AQ_station_hour_imp_lm_temp <- AQ_station_hour %>%
  bind_shadow() %>%
  impute_lm(O3 ~ CO) %>%
  impute_lm(AQI ~ CO + O3) %>%
  add_label_shadow()

ggplot(AQ_station_hour_imp_lm_temp, aes(x = CO, y = AQI, color = any_missing)) +
  geom_miss_point()

##Analysing and Performing Imputations on AQ_station_day - station_day.csv

## Have a look at the data

# Report the number of rows and columns.
print(x = "The dimensions of the dataset")
## [1] "The dimensions of the dataset"
AQ_station_day %>% dim()
## [1] 108035     16
# Show a column-wise preview of the data.
print(x = "Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_day)
## Rows: 108,035
## Columns: 16
## $ StationId  <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Date       <chr> "2017-11-24", "2017-11-25", "2017-11-26", "2017-11-27", "20…
## $ PM2.5      <dbl> 71.36, 81.40, 78.32, 88.76, 64.18, 72.47, 69.80, 73.96, 89.…
## $ PM10       <dbl> 115.75, 124.50, 129.06, 135.32, 104.09, 114.84, 114.86, 113…
## $ NO         <dbl> 1.75, 1.44, 1.26, 6.60, 2.56, 5.23, 4.69, 4.58, 7.71, 0.97,…
## $ NO2        <dbl> 20.65, 20.50, 26.00, 30.85, 28.07, 23.20, 20.17, 19.29, 26.…
## $ NOx        <dbl> 12.40, 12.08, 14.85, 21.77, 17.01, 16.59, 14.54, 13.97, 19.…
## $ NH3        <dbl> 12.19, 10.72, 10.28, 12.91, 11.42, 12.25, 10.95, 10.95, 13.…
## $ CO         <dbl> 0.10, 0.12, 0.14, 0.11, 0.09, 0.16, 0.12, 0.10, 0.10, 0.15,…
## $ SO2        <dbl> 10.76, 15.24, 26.96, 33.59, 19.00, 10.55, 14.07, 13.90, 19.…
## $ O3         <dbl> 109.26, 127.09, 117.44, 111.81, 138.18, 109.74, 118.09, 123…
## $ Benzene    <dbl> 0.17, 0.20, 0.22, 0.29, 0.17, 0.21, 0.16, 0.17, 0.25, 0.23,…
## $ Toluene    <dbl> 5.92, 6.50, 7.95, 7.63, 5.02, 4.71, 3.52, 2.85, 2.79, 3.82,…
## $ Xylene     <dbl> 0.10, 0.06, 0.08, 0.12, 0.07, 0.08, 0.06, 0.04, 0.07, 0.04,…
## $ AQI        <dbl> NA, 184, 197, 198, 188, 173, 165, 191, 191, 227, 168, 198, …
## $ AQI_Bucket <chr> "", "Moderate", "Moderate", "Moderate", "Moderate", "Modera…
# List the column names.
print(x = "Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
AQ_station_day %>% names()
##  [1] "StationId"  "Date"       "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
# Show the type and first values of every column.
print(x = "Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_day)
## 'data.frame':    108035 obs. of  16 variables:
##  $ StationId : chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ Date      : chr  "2017-11-24" "2017-11-25" "2017-11-26" "2017-11-27" ...
##  $ PM2.5     : num  71.4 81.4 78.3 88.8 64.2 ...
##  $ PM10      : num  116 124 129 135 104 ...
##  $ NO        : num  1.75 1.44 1.26 6.6 2.56 5.23 4.69 4.58 7.71 0.97 ...
##  $ NO2       : num  20.6 20.5 26 30.9 28.1 ...
##  $ NOx       : num  12.4 12.1 14.8 21.8 17 ...
##  $ NH3       : num  12.2 10.7 10.3 12.9 11.4 ...
##  $ CO        : num  0.1 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 ...
##  $ SO2       : num  10.8 15.2 27 33.6 19 ...
##  $ O3        : num  109 127 117 112 138 ...
##  $ Benzene   : num  0.17 0.2 0.22 0.29 0.17 0.21 0.16 0.17 0.25 0.23 ...
##  $ Toluene   : num  5.92 6.5 7.95 7.63 5.02 4.71 3.52 2.85 2.79 3.82 ...
##  $ Xylene    : num  0.1 0.06 0.08 0.12 0.07 0.08 0.06 0.04 0.07 0.04 ...
##  $ AQI       : num  NA 184 197 198 188 173 165 191 191 227 ...
##  $ AQI_Bucket: chr  "" "Moderate" "Moderate" "Moderate" ...
# Per-column summary of the data, including NA counts for numeric columns.
print(x = "Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
AQ_station_day %>% summary()
##   StationId             Date               PM2.5              PM10        
##  Length:108035      Length:108035      Min.   :   0.02   Min.   :   0.01  
##  Class :character   Class :character   1st Qu.:  31.88   1st Qu.:  70.15  
##  Mode  :character   Mode  :character   Median :  55.95   Median : 122.09  
##                                        Mean   :  80.27   Mean   : 157.97  
##                                        3rd Qu.:  99.92   3rd Qu.: 208.67  
##                                        Max.   :1000.00   Max.   :1000.00  
##                                        NA's   :21625     NA's   :42706    
##        NO              NO2              NOx              NH3        
##  Min.   :  0.01   Min.   :  0.01   Min.   :  0.00   Min.   :  0.01  
##  1st Qu.:  4.84   1st Qu.: 15.09   1st Qu.: 13.97   1st Qu.: 11.90  
##  Median : 10.29   Median : 27.21   Median : 26.66   Median : 23.59  
##  Mean   : 23.12   Mean   : 35.24   Mean   : 41.20   Mean   : 28.73  
##  3rd Qu.: 24.98   3rd Qu.: 46.93   3rd Qu.: 50.50   3rd Qu.: 38.14  
##  Max.   :470.00   Max.   :448.05   Max.   :467.63   Max.   :418.90  
##  NA's   :17106    NA's   :16547    NA's   :15500    NA's   :48105   
##        CO               SO2               O3            Benzene       
##  Min.   :  0.000   Min.   :  0.01   Min.   :  0.01   Min.   :  0.000  
##  1st Qu.:  0.530   1st Qu.:  5.04   1st Qu.: 18.89   1st Qu.:  0.160  
##  Median :  0.910   Median :  8.95   Median : 30.84   Median :  1.210  
##  Mean   :  1.606   Mean   : 12.26   Mean   : 38.13   Mean   :  3.358  
##  3rd Qu.:  1.450   3rd Qu.: 14.92   3rd Qu.: 47.14   3rd Qu.:  3.610  
##  Max.   :175.810   Max.   :195.65   Max.   :963.00   Max.   :455.030  
##  NA's   :12998     NA's   :25204    NA's   :25568    NA's   :31455    
##     Toluene           Xylene            AQI          AQI_Bucket       
##  Min.   :  0.00   Min.   :  0.00   Min.   :   8.0   Length:108035     
##  1st Qu.:  0.69   1st Qu.:  0.00   1st Qu.:  86.0   Class :character  
##  Median :  4.33   Median :  0.40   Median : 132.0   Mode  :character  
##  Mean   : 15.35   Mean   :  2.42   Mean   : 179.7                     
##  3rd Qu.: 17.51   3rd Qu.:  2.11   3rd Qu.: 254.0                     
##  Max.   :454.85   Max.   :170.37   Max.   :2049.0                     
##  NA's   :38702    NA's   :85137    NA's   :21010
# Put the columns of AQ_station_day on the search path; as the messages below
# show, this masks the same-named columns attached from AQ_station_hour and
# AQ_stations.
# NOTE(review): attach() copies the columns as they are at this point, so the
# recoding of "" to NA below is not reflected in the attached names; refer to
# AQ_station_day$<column> when the recoded values are needed.
attach(AQ_station_day)
## The following objects are masked from AQ_station_hour:
## 
##     AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
##     SO2, StationId, Toluene, Xylene
## The following object is masked from AQ_stations:
## 
##     StationId
# Recode empty strings to NA so that they are counted as missing values.
AQ_station_day [AQ_station_day == ""] <- NA

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 21625 PM10: 42706 NO: 17106 NO2: 16547 NOx: 15500 NH3: 48105 
## CO:  12998  SO2: 25204  O3: 25568 Benzene: 31455 Toluene: 38702 Xylene: 85137
# Tally the rows that fall into each AQI bucket.
AQ_station_day %>%
  group_by(AQI_Bucket) %>%
  count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket       n
##   <chr>        <int>
## 1 Good          5510
## 2 Moderate     29417
## 3 Poor         11493
## 4 Satisfactory 23636
## 5 Severe        5207
## 6 Very Poor    11762
## 7 <NA>         21010
## Looks like Moderate entries are the highest ones, followed by Satisfactory 
## but third highest is NA entries...

# Same per-category tally as above, repeated before the missingness analysis.
AQ_station_day %>%
  group_by(AQI_Bucket) %>%
  count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket       n
##   <chr>        <int>
## 1 Good          5510
## 2 Moderate     29417
## 3 Poor         11493
## 4 Satisfactory 23636
## 5 Severe        5207
## 6 Very Poor    11762
## 7 <NA>         21010
## Looks like Moderate entries are the highest ones, followed by Satisfactory; NA entries are the third highest...
## Lets analyse the missing data of the dataset
n_miss(AQ_station_day) ## Total number of missing parameters
## [1] 422673
miss_var_summary(AQ_station_day) ## Missingness summary
## # A tibble: 16 × 3
##    variable   n_miss pct_miss
##    <chr>       <int>    <dbl>
##  1 Xylene      85137     78.8
##  2 NH3         48105     44.5
##  3 PM10        42706     39.5
##  4 Toluene     38702     35.8
##  5 Benzene     31455     29.1
##  6 O3          25568     23.7
##  7 SO2         25204     23.3
##  8 PM2.5       21625     20.0
##  9 AQI         21010     19.4
## 10 AQI_Bucket  21010     19.4
## 11 NO          17106     15.8
## 12 NO2         16547     15.3
## 13 NOx         15500     14.3
## 14 CO          12998     12.0
## 15 StationId       0      0  
## 16 Date            0      0
miss_var_span(AQ_station_day, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 433 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     22        228     0.088         0.912       250
##  2            2     32        218     0.128         0.872       250
##  3            3     56        194     0.224         0.776       250
##  4            4      9        241     0.036         0.964       250
##  5            5     26        224     0.104         0.896       250
##  6            6    188         62     0.752         0.248       250
##  7            7     24        226     0.096         0.904       250
##  8            8     19        231     0.076         0.924       250
##  9            9     20        230     0.08          0.92        250
## 10           10      9        241     0.036         0.964       250
## # ℹ 423 more rows
miss_var_table(AQ_station_day)
## # A tibble: 14 × 3
##    n_miss_in_var n_vars pct_vars
##            <int>  <int>    <dbl>
##  1             0      2    12.5 
##  2         12998      1     6.25
##  3         15500      1     6.25
##  4         16547      1     6.25
##  5         17106      1     6.25
##  6         21010      2    12.5 
##  7         21625      1     6.25
##  8         25204      1     6.25
##  9         25568      1     6.25
## 10         31455      1     6.25
## 11         38702      1     6.25
## 12         42706      1     6.25
## 13         48105      1     6.25
## 14         85137      1     6.25
## vis_miss(AQ_station_day) Unable to visualise % of missing due to large data size
# Upset-style plot of missingness across the variables of the dataset.
gg_miss_upset(AQ_station_day) ## plot for missing data

# NOTE(review): `fct` is given the numeric AQI column here; a categorical
# column such as AQI_Bucket may have been intended -- confirm.
gg_miss_fct(x = AQ_station_day, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).

# Same 250-row spans as the miss_var_span() call above, drawn as a plot.
gg_miss_span(AQ_station_day, var = AQI, span_every = 250) ## Visualize span of AQI missingness

## From the summaries above, Xylene is missing the most (78.8%), followed by NH3 (44.5%) and PM10 (39.5%)
# Scan every column for common textual stand-ins for a missing value.
# NOTE(review): the result below is not all zero -- AQI_Bucket reports 11762
# hits, exactly the number of "Very Poor" rows, so the " " term appears to
# match inside values rather than only whole cells. Anchor the search terms
# if whole-cell matches are wanted -- confirm.
miss_scan_count(data = AQ_station_day, search = list("N/A", "NA", "na", " ","missing")) ## Count of cells matching any search term
## # A tibble: 16 × 2
##    Variable       n
##    <chr>      <int>
##  1 StationId      0
##  2 Date           0
##  3 PM2.5          0
##  4 PM10           0
##  5 NO             0
##  6 NO2            0
##  7 NOx            0
##  8 NH3            0
##  9 CO             0
## 10 SO2            0
## 11 O3             0
## 12 Benzene        0
## 13 Toluene        0
## 14 Xylene         0
## 15 AQI            0
## 16 AQI_Bucket 11762
##Create shadow matrix data
head(as_shadow(AQ_station_day))
## # A tibble: 6 × 16
##   StationId_NA Date_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA
##   <fct>        <fct>   <fct>    <fct>   <fct> <fct>  <fct>  <fct>  <fct> <fct> 
## 1 !NA          !NA     !NA      !NA     !NA   !NA    !NA    !NA    !NA   !NA   
## 2 !NA          !NA     !NA      !NA     !NA   !NA    !NA    !NA    !NA   !NA   
## 3 !NA          !NA     !NA      !NA     !NA   !NA    !NA    !NA    !NA   !NA   
## 4 !NA          !NA     !NA      !NA     !NA   !NA    !NA    !NA    !NA   !NA   
## 5 !NA          !NA     !NA      !NA     !NA   !NA    !NA    !NA    !NA   !NA   
## 6 !NA          !NA     !NA      !NA     !NA   !NA    !NA    !NA    !NA   !NA   
## # ℹ 6 more variables: O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## #   Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>
#Create nabular data by binding the shadow to the data
head(bind_shadow(AQ_station_day, only_miss = TRUE))
## # A tibble: 6 × 30
##   StationId Date   PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3 Benzene
##   <chr>     <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 AP001     2017-…  71.4  116.  1.75  20.6  12.4  12.2  0.1   10.8  109.    0.17
## 2 AP001     2017-…  81.4  124.  1.44  20.5  12.1  10.7  0.12  15.2  127.    0.2 
## 3 AP001     2017-…  78.3  129.  1.26  26    14.8  10.3  0.14  27.0  117.    0.22
## 4 AP001     2017-…  88.8  135.  6.6   30.8  21.8  12.9  0.11  33.6  112.    0.29
## 5 AP001     2017-…  64.2  104.  2.56  28.1  17.0  11.4  0.09  19    138.    0.17
## 6 AP001     2017-…  72.5  115.  5.23  23.2  16.6  12.2  0.16  10.6  110.    0.21
## # ℹ 18 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## #   NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## #   Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## #   AQI_Bucket_NA <fct>
# Lets explore the relations ship with the missing values
# Compare the mean and spread of CO between rows where AQI is recorded
# (AQI_NA == "!NA") and rows where AQI is missing (AQI_NA == "NA").
# CO itself contains missing values, so na.rm = TRUE is required; without it
# mean() and sd() return NA for both groups.
AQ_station_day %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(AQI_NA) %>%
  summarise(
    tCO_mean = mean(CO, na.rm = TRUE),
    CO_sd = sd(CO, na.rm = TRUE)
  )
## (Re-run to regenerate the printed table; the earlier run without
## na.rm = TRUE printed NA for both statistics in both groups.)

# Density of CO, coloured by whether AQI is missing and split into panels
# by whether O3 is missing.
AQ_station_day %>%
  bind_shadow() %>%
  ggplot(aes(x = CO, color = AQI_NA)) +
  geom_density() +
  facet_wrap(~O3_NA)
## Warning: Removed 12998 rows containing non-finite values (`stat_density()`).

# Explore the missingness in CO and AQI, and display it using
# `geom_miss_point()`
AQ_station_day %>%
  ggplot(aes(x = CO, y = AQI)) +
  geom_miss_point()

# Looks like there is not too much missing data for these two variables
# (AQI 19.4%, CO 12.0% in the missingness summary above)

# We would like to impute all the missing data with a value below the range
# by 10%
AQ_station_day_imp <- AQ_station_day %>% impute_below_all()
AQ_station_day_imp %>%
  ggplot(aes(x = CO, y = AQI)) +
  geom_miss_point()

# Bind the shadow columns before imputing so the *_NA columns keep a record
# of which values were originally missing.
AQ_station_day_imp_track <- AQ_station_day %>%
  bind_shadow() %>%
  impute_below_all()

# Distribution of AQI, filled by whether the value was originally missing.
AQ_station_day_imp_track %>%
  ggplot(aes(x = AQI, fill = AQI_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same view for O3.
AQ_station_day_imp_track %>%
  ggplot(aes(x = O3, fill = O3_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# CO against AQI, coloured by whether AQI was originally missing.
AQ_station_day_imp_track %>%
  ggplot(aes(x = CO, y = AQI, color = AQI_NA)) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now impute the missing AQI and O3 values with linear regression on other
# explanatory parameters: AQI is modelled on CO and O3, then O3 on CO.
# NOTE(review): O3 is used as a predictor for AQI before O3 itself is
# imputed -- confirm this ordering is intended.
AQ_station_day_imp_lm_temp <- AQ_station_day %>%
  bind_shadow() %>%
  impute_lm(AQI ~ CO + O3) %>%
  impute_lm(O3 ~ CO) %>%
  add_label_shadow()

# CO against AQI after imputation, coloured by the any_missing label.
AQ_station_day_imp_lm_temp %>%
  ggplot(aes(x = CO, y = AQI, color = any_missing)) +
  geom_miss_point()

## Analysing and performing imputations on AQ_city_day (read from "./datasets/city_day.csv")

## Have a look at the data

print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_day)
## [1] 29531    16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_day)
## Rows: 29,531
## Columns: 16
## $ City       <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Date       <chr> "2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04", "20…
## $ PM2.5      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO         <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ NO2        <dbl> 18.22, 15.69, 19.30, 18.48, 21.42, 38.48, 40.62, 36.74, 31.…
## $ NOx        <dbl> 17.15, 16.46, 29.70, 17.97, 37.76, 81.50, 130.77, 96.75, 48…
## $ NH3        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO         <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ SO2        <dbl> 27.64, 24.55, 29.07, 18.59, 39.33, 45.76, 32.28, 38.54, 58.…
## $ O3         <dbl> 133.36, 34.06, 30.70, 36.08, 39.31, 46.51, 33.47, 31.89, 25…
## $ Benzene    <dbl> 0.00, 3.68, 6.80, 4.43, 7.01, 5.42, 0.00, 0.00, 0.00, 0.00,…
## $ Toluene    <dbl> 0.02, 5.50, 16.40, 10.14, 18.89, 10.83, 0.00, 0.00, 0.00, 0…
## $ Xylene     <dbl> 0.00, 3.77, 2.25, 1.00, 2.78, 1.93, 0.00, 0.00, 0.00, 0.00,…
## $ AQI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_day)
##  [1] "City"       "Date"       "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_day)
## 'data.frame':    29531 obs. of  16 variables:
##  $ City      : chr  "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
##  $ Date      : chr  "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO        : num  0.92 0.97 17.4 1.7 22.1 ...
##  $ NO2       : num  18.2 15.7 19.3 18.5 21.4 ...
##  $ NOx       : num  17.1 16.5 29.7 18 37.8 ...
##  $ NH3       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CO        : num  0.92 0.97 17.4 1.7 22.1 ...
##  $ SO2       : num  27.6 24.6 29.1 18.6 39.3 ...
##  $ O3        : num  133.4 34.1 30.7 36.1 39.3 ...
##  $ Benzene   : num  0 3.68 6.8 4.43 7.01 5.42 0 0 0 0 ...
##  $ Toluene   : num  0.02 5.5 16.4 10.14 18.89 ...
##  $ Xylene    : num  0 3.77 2.25 1 2.78 1.93 0 0 0 0 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_day)
##      City               Date               PM2.5             PM10        
##  Length:29531       Length:29531       Min.   :  0.04   Min.   :   0.01  
##  Class :character   Class :character   1st Qu.: 28.82   1st Qu.:  56.26  
##  Mode  :character   Mode  :character   Median : 48.57   Median :  95.68  
##                                        Mean   : 67.45   Mean   : 118.13  
##                                        3rd Qu.: 80.59   3rd Qu.: 149.75  
##                                        Max.   :949.99   Max.   :1000.00  
##                                        NA's   :4598     NA's   :11140    
##        NO              NO2              NOx              NH3        
##  Min.   :  0.02   Min.   :  0.01   Min.   :  0.00   Min.   :  0.01  
##  1st Qu.:  5.63   1st Qu.: 11.75   1st Qu.: 12.82   1st Qu.:  8.58  
##  Median :  9.89   Median : 21.69   Median : 23.52   Median : 15.85  
##  Mean   : 17.57   Mean   : 28.56   Mean   : 32.31   Mean   : 23.48  
##  3rd Qu.: 19.95   3rd Qu.: 37.62   3rd Qu.: 40.13   3rd Qu.: 30.02  
##  Max.   :390.68   Max.   :362.21   Max.   :467.63   Max.   :352.89  
##  NA's   :3582     NA's   :3585     NA's   :4185     NA's   :10328   
##        CO               SO2               O3            Benzene       
##  Min.   :  0.000   Min.   :  0.01   Min.   :  0.01   Min.   :  0.000  
##  1st Qu.:  0.510   1st Qu.:  5.67   1st Qu.: 18.86   1st Qu.:  0.120  
##  Median :  0.890   Median :  9.16   Median : 30.84   Median :  1.070  
##  Mean   :  2.249   Mean   : 14.53   Mean   : 34.49   Mean   :  3.281  
##  3rd Qu.:  1.450   3rd Qu.: 15.22   3rd Qu.: 45.57   3rd Qu.:  3.080  
##  Max.   :175.810   Max.   :193.86   Max.   :257.73   Max.   :455.030  
##  NA's   :2059      NA's   :3854     NA's   :4022     NA's   :5623     
##     Toluene            Xylene            AQI          AQI_Bucket       
##  Min.   :  0.000   Min.   :  0.00   Min.   :  13.0   Length:29531      
##  1st Qu.:  0.600   1st Qu.:  0.14   1st Qu.:  81.0   Class :character  
##  Median :  2.970   Median :  0.98   Median : 118.0   Mode  :character  
##  Mean   :  8.701   Mean   :  3.07   Mean   : 166.5                     
##  3rd Qu.:  9.150   3rd Qu.:  3.35   3rd Qu.: 208.0                     
##  Max.   :454.850   Max.   :170.37   Max.   :2049.0                     
##  NA's   :8041      NA's   :18109    NA's   :4681
# Recode empty strings as NA *before* attaching: attach() copies the columns
# onto the search path at call time, so cleaning the data frame afterwards
# would leave the attached AQI_Bucket still holding "" instead of NA.
AQ_city_day[AQ_city_day == ""] <- NA
attach(AQ_city_day)
## The following objects are masked from AQ_station_day:
## 
##     AQI, AQI_Bucket, Benzene, CO, Date, NH3, NO, NO2, NOx, O3, PM10,
##     PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
## 
##     AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
##     SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
## 
##     City

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:

## PM2.5: 4598  PM10: 11140  NO: 3582  NO2: 3585 NOx: 4185 NH3: 10328 
## CO: 2059  SO2: 3854 O3: 4022  Benzene: 5623  Toluene: 8041 Xylene: 18109
# Number of city-day records in each AQI category (NA is its own group).
AQ_city_day %>%
  group_by(AQI_Bucket) %>%
  count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket       n
##   <chr>        <int>
## 1 Good          1341
## 2 Moderate      8829
## 3 Poor          2781
## 4 Satisfactory  8224
## 5 Severe        1338
## 6 Very Poor     2337
## 7 <NA>          4681
## Looks like Moderate entries are the highest ones, followed by Satisfactory 
## but third highest is NA entries...

# Same per-category tally as above, repeated before the missingness analysis.
AQ_city_day %>%
  group_by(AQI_Bucket) %>%
  count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket       n
##   <chr>        <int>
## 1 Good          1341
## 2 Moderate      8829
## 3 Poor          2781
## 4 Satisfactory  8224
## 5 Severe        1338
## 6 Very Poor     2337
## 7 <NA>          4681
## Looks like Moderate entries are the highest ones, followed by Satisfactory; NA entries are the third highest...
## Lets analyse the missing data of the dataset
n_miss(AQ_city_day) ## Total number of missing parameters
## [1] 88488
miss_var_summary(AQ_city_day) ## Missingness summary
## # A tibble: 16 × 3
##    variable   n_miss pct_miss
##    <chr>       <int>    <dbl>
##  1 Xylene      18109    61.3 
##  2 PM10        11140    37.7 
##  3 NH3         10328    35.0 
##  4 Toluene      8041    27.2 
##  5 Benzene      5623    19.0 
##  6 AQI          4681    15.9 
##  7 AQI_Bucket   4681    15.9 
##  8 PM2.5        4598    15.6 
##  9 NOx          4185    14.2 
## 10 O3           4022    13.6 
## 11 SO2          3854    13.1 
## 12 NO2          3585    12.1 
## 13 NO           3582    12.1 
## 14 CO           2059     6.97
## 15 City            0     0   
## 16 Date            0     0
miss_var_span(AQ_city_day, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 119 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1     41        209     0.164         0.836       250
##  2            2    196         54     0.784         0.216       250
##  3            3    133        117     0.532         0.468       250
##  4            4    250          0     1             0           250
##  5            5     32        218     0.128         0.872       250
##  6            6      3        247     0.012         0.988       250
##  7            7     13        237     0.052         0.948       250
##  8            8      7        243     0.028         0.972       250
##  9            9      4        246     0.016         0.984       250
## 10           10     49        201     0.196         0.804       250
## # ℹ 109 more rows
miss_var_table(AQ_city_day)
## # A tibble: 14 × 3
##    n_miss_in_var n_vars pct_vars
##            <int>  <int>    <dbl>
##  1             0      2    12.5 
##  2          2059      1     6.25
##  3          3582      1     6.25
##  4          3585      1     6.25
##  5          3854      1     6.25
##  6          4022      1     6.25
##  7          4185      1     6.25
##  8          4598      1     6.25
##  9          4681      2    12.5 
## 10          5623      1     6.25
## 11          8041      1     6.25
## 12         10328      1     6.25
## 13         11140      1     6.25
## 14         18109      1     6.25
## vis_miss(AQ_city_day) Unable to visualise % of missing due to large data size
# Upset-style plot of missingness across the variables of the dataset.
gg_miss_upset(AQ_city_day) ## plot for missing data

# NOTE(review): `fct` is given the numeric AQI column here; a categorical
# column such as AQI_Bucket may have been intended -- confirm.
gg_miss_fct(x = AQ_city_day, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).

# Same 250-row spans as the miss_var_span() call above, drawn as a plot.
gg_miss_span(AQ_city_day, var = AQI, span_every = 250) ## Visualize span of AQI missingness

## From the summaries above, Xylene is missing the most (61.3%), followed by PM10 (37.7%) and NH3 (35.0%)
# Scan every column for common textual stand-ins for a missing value.
# NOTE(review): the result below is not all zero -- City reports 7541 hits and
# AQI_Bucket 2337, exactly the number of "Very Poor" rows, so the search terms
# appear to match inside values rather than only whole cells. Anchor the
# search terms if whole-cell matches are wanted -- confirm.
miss_scan_count(data = AQ_city_day, search = list("N/A", "NA", "na", " ","missing")) ## Count of cells matching any search term
## # A tibble: 16 × 2
##    Variable       n
##    <chr>      <int>
##  1 City        7541
##  2 Date           0
##  3 PM2.5          0
##  4 PM10           0
##  5 NO             0
##  6 NO2            0
##  7 NOx            0
##  8 NH3            0
##  9 CO             0
## 10 SO2            0
## 11 O3             0
## 12 Benzene        0
## 13 Toluene        0
## 14 Xylene         0
## 15 AQI            0
## 16 AQI_Bucket  2337
##Create shadow matrix data
head(as_shadow(AQ_city_day))
## # A tibble: 6 × 16
##   City_NA Date_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA O3_NA
##   <fct>   <fct>   <fct>    <fct>   <fct> <fct>  <fct>  <fct>  <fct> <fct>  <fct>
## 1 !NA     !NA     NA       NA      !NA   !NA    !NA    NA     !NA   !NA    !NA  
## 2 !NA     !NA     NA       NA      !NA   !NA    !NA    NA     !NA   !NA    !NA  
## 3 !NA     !NA     NA       NA      !NA   !NA    !NA    NA     !NA   !NA    !NA  
## 4 !NA     !NA     NA       NA      !NA   !NA    !NA    NA     !NA   !NA    !NA  
## 5 !NA     !NA     NA       NA      !NA   !NA    !NA    NA     !NA   !NA    !NA  
## 6 !NA     !NA     NA       NA      !NA   !NA    !NA    NA     !NA   !NA    !NA  
## # ℹ 5 more variables: Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>,
## #   AQI_NA <fct>, AQI_Bucket_NA <fct>
#Create nabular data by binding the shadow to the data
head(bind_shadow(AQ_city_day, only_miss = TRUE))
## # A tibble: 6 × 30
##   City      Date   PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3 Benzene
##   <chr>     <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Ahmedabad 2015-…    NA    NA  0.92  18.2  17.2    NA  0.92  27.6 133.     0   
## 2 Ahmedabad 2015-…    NA    NA  0.97  15.7  16.5    NA  0.97  24.6  34.1    3.68
## 3 Ahmedabad 2015-…    NA    NA 17.4   19.3  29.7    NA 17.4   29.1  30.7    6.8 
## 4 Ahmedabad 2015-…    NA    NA  1.7   18.5  18.0    NA  1.7   18.6  36.1    4.43
## 5 Ahmedabad 2015-…    NA    NA 22.1   21.4  37.8    NA 22.1   39.3  39.3    7.01
## 6 Ahmedabad 2015-…    NA    NA 45.4   38.5  81.5    NA 45.4   45.8  46.5    5.42
## # ℹ 18 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## #   NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## #   Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## #   AQI_Bucket_NA <fct>
# Lets explore the relations ship with the missing values
# Compare the mean and spread of CO between rows where AQI is recorded
# (AQI_NA == "!NA") and rows where AQI is missing (AQI_NA == "NA").
# CO itself contains missing values, so na.rm = TRUE is required; without it
# mean() and sd() return NA for both groups.
AQ_city_day %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(AQI_NA) %>%
  summarise(
    tCO_mean = mean(CO, na.rm = TRUE),
    CO_sd = sd(CO, na.rm = TRUE)
  )
## (Re-run to regenerate the printed table; the earlier run without
## na.rm = TRUE printed NA for both statistics in both groups.)

# Density of CO, coloured by whether AQI is missing and split into panels
# by whether O3 is missing.
AQ_city_day %>%
  bind_shadow() %>%
  ggplot(aes(x = CO, color = AQI_NA)) +
  geom_density() +
  facet_wrap(~O3_NA)
## Warning: Removed 2059 rows containing non-finite values (`stat_density()`).

# Explore the missingness in CO and AQI, and display it using
# `geom_miss_point()`
AQ_city_day %>%
  ggplot(aes(x = CO, y = AQI)) +
  geom_miss_point()

# Looks like there is not too much missing data for these two variables
# (AQI 15.9%, CO 6.97% in the missingness summary above)

# We would like to impute all the missing data with a value below the range
# by 10%
AQ_city_day_imp <- AQ_city_day %>% impute_below_all()
AQ_city_day_imp %>%
  ggplot(aes(x = CO, y = AQI)) +
  geom_miss_point()

# Bind the shadow columns before imputing so the *_NA columns keep a record
# of which values were originally missing.
AQ_city_day_imp_track <- AQ_city_day %>%
  bind_shadow() %>%
  impute_below_all()

# Distribution of AQI, filled by whether the value was originally missing.
AQ_city_day_imp_track %>%
  ggplot(aes(x = AQI, fill = AQI_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same view for O3.
AQ_city_day_imp_track %>%
  ggplot(aes(x = O3, fill = O3_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# CO against AQI, coloured by whether AQI was originally missing.
AQ_city_day_imp_track %>%
  ggplot(aes(x = CO, y = AQI, color = AQI_NA)) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now impute the missing AQI and O3 values with linear regression on other
# explanatory parameters: AQI is modelled on CO and O3, then O3 on CO.
# NOTE(review): O3 is used as a predictor for AQI before O3 itself is
# imputed -- confirm this ordering is intended.
AQ_city_day_imp_lm_temp <- AQ_city_day %>%
  bind_shadow() %>%
  impute_lm(AQI ~ CO + O3) %>%
  impute_lm(O3 ~ CO) %>%
  add_label_shadow()

# CO against AQI after imputation, coloured by the any_missing label.
AQ_city_day_imp_lm_temp %>%
  ggplot(aes(x = CO, y = AQI, color = any_missing)) +
  geom_miss_point()

##Analysing AQ_city_hour: city_hour.csv

## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_hour)
## [1] 707875     16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_hour)
## Rows: 707,875
## Columns: 16
## $ City       <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Datetime   <chr> "2015-01-01 01:00:00", "2015-01-01 02:00:00", "2015-01-01 0…
## $ PM2.5      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO         <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ NO2        <dbl> 40.01, 27.75, 19.32, 16.45, 14.90, 15.95, 15.94, 16.66, 16.…
## $ NOx        <dbl> 36.37, 19.73, 11.08, 9.20, 7.85, 10.82, 12.47, 16.48, 18.02…
## $ NH3        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO         <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ SO2        <dbl> 122.07, 85.90, 52.83, 39.53, 32.63, 29.87, 27.41, 20.92, 16…
## $ O3         <dbl> NA, NA, NA, 153.58, NA, 64.25, 191.96, 177.21, 122.08, NA, …
## $ Benzene    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Toluene    <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Xylene     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AQI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_hour)
##  [1] "City"       "Datetime"   "PM2.5"      "PM10"       "NO"        
##  [6] "NO2"        "NOx"        "NH3"        "CO"         "SO2"       
## [11] "O3"         "Benzene"    "Toluene"    "Xylene"     "AQI"       
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_hour)
## 'data.frame':    707875 obs. of  16 variables:
##  $ City      : chr  "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
##  $ Datetime  : chr  "2015-01-01 01:00:00" "2015-01-01 02:00:00" "2015-01-01 03:00:00" "2015-01-01 04:00:00" ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO        : num  1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
##  $ NO2       : num  40 27.8 19.3 16.4 14.9 ...
##  $ NOx       : num  36.37 19.73 11.08 9.2 7.85 ...
##  $ NH3       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CO        : num  1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
##  $ SO2       : num  122.1 85.9 52.8 39.5 32.6 ...
##  $ O3        : num  NA NA NA 154 NA ...
##  $ Benzene   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Toluene   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Xylene    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_hour)
##      City             Datetime             PM2.5              PM10        
##  Length:707875      Length:707875      Min.   :   0.01   Min.   :   0.01  
##  Class :character   Class :character   1st Qu.:  26.20   1st Qu.:  52.38  
##  Mode  :character   Mode  :character   Median :  46.42   Median :  91.50  
##                                        Mean   :  67.62   Mean   : 119.08  
##                                        3rd Qu.:  79.49   3rd Qu.: 147.52  
##                                        Max.   : 999.99   Max.   :1000.00  
##                                        NA's   :145088    NA's   :296737   
##        NO              NO2              NOx              NH3        
##  Min.   :  0.01   Min.   :  0.01   Min.   :  0.00   Min.   :  0.01  
##  1st Qu.:  3.84   1st Qu.: 10.81   1st Qu.: 10.66   1st Qu.:  8.12  
##  Median :  7.96   Median : 20.32   Median : 20.79   Median : 15.38  
##  Mean   : 17.42   Mean   : 28.89   Mean   : 32.29   Mean   : 23.61  
##  3rd Qu.: 16.15   3rd Qu.: 36.35   3rd Qu.: 37.15   3rd Qu.: 29.23  
##  Max.   :499.99   Max.   :499.51   Max.   :498.61   Max.   :499.97  
##  NA's   :116632   NA's   :117122   NA's   :123224   NA's   :272542  
##        CO              SO2               O3            Benzene      
##  Min.   :  0.00   Min.   :  0.01   Min.   :  0.01   Min.   :  0.00  
##  1st Qu.:  0.42   1st Qu.:  4.88   1st Qu.: 13.42   1st Qu.:  0.05  
##  Median :  0.80   Median :  8.37   Median : 26.24   Median :  0.86  
##  Mean   :  2.18   Mean   : 14.04   Mean   : 34.80   Mean   :  3.09  
##  3rd Qu.:  1.37   3rd Qu.: 14.78   3rd Qu.: 47.62   3rd Qu.:  2.75  
##  Max.   :498.57   Max.   :199.96   Max.   :497.62   Max.   :498.07  
##  NA's   :86517    NA's   :130373   NA's   :129208   NA's   :163646  
##     Toluene           Xylene            AQI          AQI_Bucket       
##  Min.   :  0.00   Min.   :  0.0    Min.   :   8.0   Length:707875     
##  1st Qu.:  0.37   1st Qu.:  0.1    1st Qu.:  79.0   Class :character  
##  Median :  2.59   Median :  0.8    Median : 116.0   Mode  :character  
##  Mean   :  8.66   Mean   :  3.1    Mean   : 166.4                     
##  3rd Qu.:  8.41   3rd Qu.:  3.1    3rd Qu.: 208.0                     
##  Max.   :499.40   Max.   :500.0    Max.   :3133.0                     
##  NA's   :220607   NA's   :455829   NA's   :129080
# Recode empty strings as NA *before* attaching: attach() copies the columns
# onto the search path at call time, so cleaning the data frame afterwards
# would leave the attached AQI_Bucket still holding "" instead of NA.
AQ_city_hour[AQ_city_hour == ""] <- NA
attach(AQ_city_hour)
## The following objects are masked from AQ_city_day:
## 
##     AQI, AQI_Bucket, Benzene, CO, City, NH3, NO, NO2, NOx, O3, PM10,
##     PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_day:
## 
##     AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
##     SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
## 
##     AQI, AQI_Bucket, Benzene, CO, Datetime, NH3, NO, NO2, NOx, O3,
##     PM10, PM2.5, SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
## 
##     City

## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 145088 PM10: 296737 NO: 116632   NO2: 117122  NOx: 123224 NH3:  272542     
## CO: 86517  SO2: 130373   O3: 129208 Benzene: 163646   Toluene: 220607 Xylene: 455829   
# Number of city-hour records in each AQI category (NA is its own group).
AQ_city_hour %>%
  group_by(AQI_Bucket) %>%
  count()
## # A tibble: 7 × 2
## # Groups:   AQI_Bucket [7]
##   AQI_Bucket        n
##   <chr>         <int>
## 1 Good          38611
## 2 Moderate     198991
## 3 Poor          66654
## 4 Satisfactory 189434
## 5 Severe        27650
## 6 Very Poor     57455
## 7 <NA>         129080
## Looks like Moderate entries are the highest ones, followed by Satisfactory 
## but third highest is NA entries...

n_miss(AQ_city_hour) ## Total number of missing parameters
## [1] 2515685
miss_var_summary(AQ_city_hour) ## Missingness summary
## # A tibble: 16 × 3
##    variable   n_miss pct_miss
##    <chr>       <int>    <dbl>
##  1 Xylene     455829     64.4
##  2 PM10       296737     41.9
##  3 NH3        272542     38.5
##  4 Toluene    220607     31.2
##  5 Benzene    163646     23.1
##  6 PM2.5      145088     20.5
##  7 SO2        130373     18.4
##  8 O3         129208     18.3
##  9 AQI        129080     18.2
## 10 AQI_Bucket 129080     18.2
## 11 NOx        123224     17.4
## 12 NO2        117122     16.5
## 13 NO         116632     16.5
## 14 CO          86517     12.2
## 15 City            0      0  
## 16 Datetime        0      0
miss_var_span(AQ_city_hour, var = AQI, span_every = 250) ## Missingness spread
## # A tibble: 2,832 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1    250          0     1             0           250
##  2            2    250          0     1             0           250
##  3            3    180         70     0.72          0.28        250
##  4            4      0        250     0             1           250
##  5            5      0        250     0             1           250
##  6            6    102        148     0.408         0.592       250
##  7            7     54        196     0.216         0.784       250
##  8            8      0        250     0             1           250
##  9            9      0        250     0             1           250
## 10           10     74        176     0.296         0.704       250
## # ℹ 2,822 more rows
miss_var_table(AQ_city_hour)
## # A tibble: 14 × 3
##    n_miss_in_var n_vars pct_vars
##            <int>  <int>    <dbl>
##  1             0      2    12.5 
##  2         86517      1     6.25
##  3        116632      1     6.25
##  4        117122      1     6.25
##  5        123224      1     6.25
##  6        129080      2    12.5 
##  7        129208      1     6.25
##  8        130373      1     6.25
##  9        145088      1     6.25
## 10        163646      1     6.25
## 11        220607      1     6.25
## 12        272542      1     6.25
## 13        296737      1     6.25
## 14        455829      1     6.25
## vis_miss(AQ_city_hour) Unable to visualise % of missing due to large data size
# Upset-style plot of missingness across the variables of the dataset.
gg_miss_upset(AQ_city_hour) ## plot for missing data

# NOTE(review): `fct` is given the numeric AQI column here; a categorical
# column such as AQI_Bucket may have been intended -- confirm.
gg_miss_fct(x = AQ_city_hour, fct = AQI) ## Heat map of missingness
## Warning: Removed 15 rows containing missing values (`geom_tile()`).

# Same 250-row spans as the miss_var_span() call above, drawn as a plot.
gg_miss_span(AQ_city_hour, var = AQI, span_every = 250) ## Visualize span of AQI missingness

## From the summaries above, Xylene is missing the most (64.4%), followed by PM10 (41.9%) and NH3 (38.5%)
# Scan every column for common textual stand-ins for a missing value.
# NOTE(review): the result below is not all zero -- Datetime reports 707875
# hits (every row) and AQI_Bucket 57455, exactly the number of "Very Poor"
# rows, so the " " term appears to match inside values rather than only whole
# cells. Anchor the search terms if whole-cell matches are wanted -- confirm.
miss_scan_count(data = AQ_city_hour, search = list("N/A", "NA", "na", " ","missing")) ## Count of cells matching any search term
## # A tibble: 16 × 2
##    Variable        n
##    <chr>       <int>
##  1 City       180770
##  2 Datetime   707875
##  3 PM2.5           0
##  4 PM10            0
##  5 NO              0
##  6 NO2             0
##  7 NOx             0
##  8 NH3             0
##  9 CO              0
## 10 SO2             0
## 11 O3              0
## 12 Benzene         0
## 13 Toluene         0
## 14 Xylene          0
## 15 AQI             0
## 16 AQI_Bucket  57455
##Create shadow matrix data
head(as_shadow(AQ_city_hour))
## # A tibble: 6 × 16
##   City_NA Datetime_NA PM2.5_NA PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA
##   <fct>   <fct>       <fct>    <fct>   <fct> <fct>  <fct>  <fct>  <fct> <fct> 
## 1 !NA     !NA         NA       NA      !NA   !NA    !NA    NA     !NA   !NA   
## 2 !NA     !NA         NA       NA      !NA   !NA    !NA    NA     !NA   !NA   
## 3 !NA     !NA         NA       NA      !NA   !NA    !NA    NA     !NA   !NA   
## 4 !NA     !NA         NA       NA      !NA   !NA    !NA    NA     !NA   !NA   
## 5 !NA     !NA         NA       NA      !NA   !NA    !NA    NA     !NA   !NA   
## 6 !NA     !NA         NA       NA      !NA   !NA    !NA    NA     !NA   !NA   
## # ℹ 6 more variables: O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## #   Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>
# Nabular data: the original columns followed by shadow columns, the latter
# only for variables that contain missing values (only_miss = TRUE)
AQ_city_hour %>%
  bind_shadow(only_miss = TRUE) %>%
  head()
## # A tibble: 6 × 30
##   City    Datetime PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3 Benzene
##   <chr>   <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Ahmeda… 2015-01…    NA    NA  1     40.0 36.4     NA  1    122.   NA         0
## 2 Ahmeda… 2015-01…    NA    NA  0.02  27.8 19.7     NA  0.02  85.9  NA         0
## 3 Ahmeda… 2015-01…    NA    NA  0.08  19.3 11.1     NA  0.08  52.8  NA         0
## 4 Ahmeda… 2015-01…    NA    NA  0.3   16.4  9.2     NA  0.3   39.5 154.        0
## 5 Ahmeda… 2015-01…    NA    NA  0.12  14.9  7.85    NA  0.12  32.6  NA         0
## 6 Ahmeda… 2015-01…    NA    NA  0.33  16.0 10.8     NA  0.33  29.9  64.2       0
## # ℹ 18 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>,
## #   NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>,
## #   Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>, AQI_NA <fct>,
## #   AQI_Bucket_NA <fct>
# Explore the relationship with the missing values: mean and standard
# deviation of CO for rows where AQI is missing versus present.
# NOTE(review): mean() and sd() are called without na.rm = TRUE, so any NA in
# CO turns the whole group summary into NA - confirm whether that is intended.
AQ_city_hour %>%
  bind_shadow(only_miss = TRUE) %>%
  group_by(AQI_NA) %>%
  summarise(
    tCO_mean = mean(CO),
    CO_sd = sd(CO)
  )
## # A tibble: 2 × 3
##   AQI_NA tCO_mean CO_sd
##   <fct>     <dbl> <dbl>
## 1 !NA          NA    NA
## 2 NA           NA    NA
# Because mean() and sd() are called without na.rm = TRUE, the NA values in CO make both the mean and the SD NA for each group

# Density of CO, coloured by whether AQI is missing, with one panel per
# missingness state of O3
AQ_city_hour %>%
  bind_shadow() %>%
  ggplot(mapping = aes(x = CO, colour = AQI_NA)) +
  geom_density() +
  facet_wrap(~ O3_NA)
## Warning: Removed 86517 rows containing non-finite values (`stat_density()`).

# Explore the missingness in CO and AQI, and display it using `geom_miss_point`
ggplot(data = AQ_city_hour, mapping = aes(x = CO, y = AQI)) +
  geom_miss_point()

# Looks like there is not too much missing data

# Impute every missing value with a value below the observed range of its
# column (by 10%), then redraw the CO / AQI scatter
AQ_city_hour_imp <- AQ_city_hour %>% impute_below_all()
ggplot(data = AQ_city_hour_imp, mapping = aes(x = CO, y = AQI)) +
  geom_miss_point()

# Track the imputed values as well: bind the shadow columns first, then
# impute below the range
AQ_city_hour_imp_track <- impute_below_all(bind_shadow(AQ_city_hour))

# Histogram of AQI, filled by whether the value was originally missing
ggplot(data = AQ_city_hour_imp_track, mapping = aes(x = AQI, fill = AQI_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same view for O3
ggplot(data = AQ_city_hour_imp_track, mapping = aes(x = O3, fill = O3_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# CO against AQI, coloured by the original missingness of AQI
ggplot(
  data = AQ_city_hour_imp_track,
  mapping = aes(x = CO, y = AQI, colour = AQI_NA)
) +
  geom_point()

## So we have successfully imputed all the NA values here

# Now lets fix the critically missing parameters AQI and O3 via linear
# regression on the other explanatory parameters, keeping the shadow columns
# so that imputed cells stay traceable.
# O3 is imputed first: impute_lm() cannot fill a row whose predictors are
# missing, so AQI ~ CO + O3 has to run after the gaps in O3 have been filled.
# Rows where CO itself is missing are presumably still left un-imputed.
AQ_city_hour_imp_lm_temp <- AQ_city_hour %>%
  bind_shadow() %>%
  impute_lm(O3 ~ CO) %>%
  impute_lm(AQI ~ CO + O3) %>%
  add_label_shadow()

# CO against AQI, coloured by whether the row had any missing value
ggplot(AQ_city_hour_imp_lm_temp, aes(x = CO, y = AQI, color = any_missing)) +
  geom_miss_point()

##Analysing Airport_delay: Aiport_Delay.csv

## Have a look at the data

print("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
Airport_delay %>% dim()
## [1] 14952    22
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
Airport_delay %>% glimpse()
## Rows: 14,952
## Columns: 22
## $ Date                                            <chr> "28-1-18", "28-1-18", …
## $ Departure.Airport                               <chr> "BLR", "CCU", "DEL", "…
## $ Departure.Airport.Rating..out.of.10.            <dbl> NA, NA, 7.99, 7.29, NA…
## $ Departure.Airport.On.Time.Rating..out.of.10.    <dbl> NA, NA, 7.3, 6.2, NA, …
## $ Departure.Airport.Service.Rating..out.of.10.    <dbl> NA, NA, 9.1, 9.0, NA, …
## $ Arrival.Airport                                 <chr> "DEL", "DEL", "HYD", "…
## $ Arrival.Airport.Rating..out.of.10.              <dbl> 7.99, 7.99, 8.27, 7.99…
## $ Arrival.Airport.On.Time.Rating..out.of.10.      <dbl> 7.3, 7.3, 7.8, 7.3, 6.…
## $ Arrival.Airport.Service.Rating..out.of.10.      <dbl> 9.1, 9.1, 9.0, 9.1, 9.…
## $ Airplane.Type                                   <chr> "", "", "", "", "", "A…
## $ Expected.Departure.Time                         <chr> "6:10", "7:00", "7:05"…
## $ Departure.Time                                  <chr> "6:10", "7:01", "7:33"…
## $ Departure.Delay                                 <chr> "0:00:00", "0:01:00", …
## $ Duration                                        <chr> "2:20", "2:09", "1:46"…
## $ Expected.Arrival.Time                           <chr> "8:55", "9:10", "9:10"…
## $ Arrival.Time                                    <chr> "8:30", "9:10", "9:19"…
## $ Arrival.Time.Delay                              <chr> "-0:25:00", "0:00:00",…
## $ Carrier                                         <chr> "Air India", "Air Indi…
## $ Carrier.Rating..out.of.10.                      <dbl> 6.6, 6.6, 6.6, 6.6, 6.…
## $ Carrier.Market.Share..out.of.100.               <dbl> 12.0, 12.0, 12.0, 12.0…
## $ Carrier.Load.Factor..out.of.100.                <dbl> 80.75, 80.75, 80.75, 8…
## $ Carrier.On.Time.Performance.Rating..out.of.100. <dbl> 70.3, 70.3, 70.3, 70.3…
print(x = "Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
Airport_delay %>% names()
##  [1] "Date"                                           
##  [2] "Departure.Airport"                              
##  [3] "Departure.Airport.Rating..out.of.10."           
##  [4] "Departure.Airport.On.Time.Rating..out.of.10."   
##  [5] "Departure.Airport.Service.Rating..out.of.10."   
##  [6] "Arrival.Airport"                                
##  [7] "Arrival.Airport.Rating..out.of.10."             
##  [8] "Arrival.Airport.On.Time.Rating..out.of.10."     
##  [9] "Arrival.Airport.Service.Rating..out.of.10."     
## [10] "Airplane.Type"                                  
## [11] "Expected.Departure.Time"                        
## [12] "Departure.Time"                                 
## [13] "Departure.Delay"                                
## [14] "Duration"                                       
## [15] "Expected.Arrival.Time"                          
## [16] "Arrival.Time"                                   
## [17] "Arrival.Time.Delay"                             
## [18] "Carrier"                                        
## [19] "Carrier.Rating..out.of.10."                     
## [20] "Carrier.Market.Share..out.of.100."              
## [21] "Carrier.Load.Factor..out.of.100."               
## [22] "Carrier.On.Time.Performance.Rating..out.of.100."
print(x = "Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
Airport_delay %>% str()
## 'data.frame':    14952 obs. of  22 variables:
##  $ Date                                           : chr  "28-1-18" "28-1-18" "28-1-18" "28-1-18" ...
##  $ Departure.Airport                              : chr  "BLR" "CCU" "DEL" "BOM" ...
##  $ Departure.Airport.Rating..out.of.10.           : num  NA NA 7.99 7.29 NA 7.99 NA NA 7.99 NA ...
##  $ Departure.Airport.On.Time.Rating..out.of.10.   : num  NA NA 7.3 6.2 NA 7.3 NA NA 7.3 NA ...
##  $ Departure.Airport.Service.Rating..out.of.10.   : num  NA NA 9.1 9 NA 9.1 NA NA 9.1 NA ...
##  $ Arrival.Airport                                : chr  "DEL" "DEL" "HYD" "DEL" ...
##  $ Arrival.Airport.Rating..out.of.10.             : num  7.99 7.99 8.27 7.99 7.29 8.27 7.29 7.99 8.27 7.29 ...
##  $ Arrival.Airport.On.Time.Rating..out.of.10.     : num  7.3 7.3 7.8 7.3 6.2 7.8 6.2 7.3 7.8 6.2 ...
##  $ Arrival.Airport.Service.Rating..out.of.10.     : num  9.1 9.1 9 9.1 9 9 9 9.1 9 9 ...
##  $ Airplane.Type                                  : chr  "" "" "" "" ...
##  $ Expected.Departure.Time                        : chr  "6:10" "7:00" "7:05" "7:00" ...
##  $ Departure.Time                                 : chr  "6:10" "7:01" "7:33" "7:07" ...
##  $ Departure.Delay                                : chr  "0:00:00" "0:01:00" "0:28:00" "0:07:00" ...
##  $ Duration                                       : chr  "2:20" "2:09" "1:46" "1:40" ...
##  $ Expected.Arrival.Time                          : chr  "8:55" "9:10" "9:10" "9:05" ...
##  $ Arrival.Time                                   : chr  "8:30" "9:10" "9:19" "8:47" ...
##  $ Arrival.Time.Delay                             : chr  "-0:25:00" "0:00:00" "0:09:00" "-0:18:00" ...
##  $ Carrier                                        : chr  "Air India" "Air India" "Air India" "Air India" ...
##  $ Carrier.Rating..out.of.10.                     : num  6.6 6.6 6.6 6.6 6.6 7.2 7.2 7.9 7.9 7.9 ...
##  $ Carrier.Market.Share..out.of.100.              : num  12 12 12 12 12 8.8 8.8 39.7 39.7 39.7 ...
##  $ Carrier.Load.Factor..out.of.100.               : num  80.8 80.8 80.8 80.8 80.8 ...
##  $ Carrier.On.Time.Performance.Rating..out.of.100.: num  70.3 70.3 70.3 70.3 70.3 91.8 91.8 87.4 87.4 87.4 ...
print(x = "Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
Airport_delay %>% summary()
##      Date           Departure.Airport  Departure.Airport.Rating..out.of.10.
##  Length:14952       Length:14952       Min.   :7.290                       
##  Class :character   Class :character   1st Qu.:7.290                       
##  Mode  :character   Mode  :character   Median :7.990                       
##                                        Mean   :7.741                       
##                                        3rd Qu.:7.990                       
##                                        Max.   :8.270                       
##                                        NA's   :10043                       
##  Departure.Airport.On.Time.Rating..out.of.10.
##  Min.   :6.200                               
##  1st Qu.:6.200                               
##  Median :7.300                               
##  Mean   :6.908                               
##  3rd Qu.:7.300                               
##  Max.   :7.800                               
##  NA's   :10043                               
##  Departure.Airport.Service.Rating..out.of.10. Arrival.Airport   
##  Min.   :9.000                                Length:14952      
##  1st Qu.:9.000                                Class :character  
##  Median :9.100                                Mode  :character  
##  Mean   :9.064                                                  
##  3rd Qu.:9.100                                                  
##  Max.   :9.100                                                  
##  NA's   :10043                                                  
##  Arrival.Airport.Rating..out.of.10. Arrival.Airport.On.Time.Rating..out.of.10.
##  Min.   :7.29                       Min.   :6.200                             
##  1st Qu.:7.99                       1st Qu.:7.300                             
##  Median :7.99                       Median :7.300                             
##  Mean   :7.91                       Mean   :7.187                             
##  3rd Qu.:7.99                       3rd Qu.:7.300                             
##  Max.   :8.27                       Max.   :7.800                             
##                                                                               
##  Arrival.Airport.Service.Rating..out.of.10. Airplane.Type     
##  Min.   :9.000                              Length:14952      
##  1st Qu.:9.000                              Class :character  
##  Median :9.100                              Mode  :character  
##  Mean   :9.059                                                
##  3rd Qu.:9.100                                                
##  Max.   :9.100                                                
##                                                               
##  Expected.Departure.Time Departure.Time     Departure.Delay   
##  Length:14952            Length:14952       Length:14952      
##  Class :character        Class :character   Class :character  
##  Mode  :character        Mode  :character   Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##    Duration         Expected.Arrival.Time Arrival.Time       Arrival.Time.Delay
##  Length:14952       Length:14952          Length:14952       Length:14952      
##  Class :character   Class :character      Class :character   Class :character  
##  Mode  :character   Mode  :character      Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##    Carrier          Carrier.Rating..out.of.10.
##  Length:14952       Min.   :6.600             
##  Class :character   1st Qu.:6.800             
##  Mode  :character   Median :7.200             
##                     Mean   :7.531             
##                     3rd Qu.:7.900             
##                     Max.   :9.200             
##                                               
##  Carrier.Market.Share..out.of.100. Carrier.Load.Factor..out.of.100.
##  Min.   : 3.6                      Min.   :80.75                   
##  1st Qu.: 4.0                      1st Qu.:81.80                   
##  Median :12.0                      Median :86.00                   
##  Mean   :13.2                      Mean   :86.88                   
##  3rd Qu.:13.1                      3rd Qu.:93.30                   
##  Max.   :39.7                      Max.   :93.90                   
##                                                                    
##  Carrier.On.Time.Performance.Rating..out.of.100.
##  Min.   :70.30                                  
##  1st Qu.:74.70                                  
##  Median :87.40                                  
##  Mean   :83.14                                  
##  3rd Qu.:89.10                                  
##  Max.   :91.80                                  
## 
# NOTE(review): attach() puts the columns of Airport_delay on the search path
# (masking `Date` from the earlier attachments, per the messages below). The
# attached copy is presumably not updated when Airport_delay is modified
# afterwards - confirm nothing relies on the bare column names.
attach(Airport_delay)
## The following object is masked from AQ_city_day:
## 
##     Date
## The following object is masked from AQ_station_day:
## 
##     Date
# Treat empty strings as missing values in every column
is.na(Airport_delay) <- Airport_delay == ""

# One row per (departure airport, departure on-time rating) combination
Airport_delay %>%
  group_by(
    Departure.Airport,
    Departure.Airport.On.Time.Rating..out.of.10.
  ) %>%
  summarise()
## `summarise()` has grouped output by 'Departure.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 5 × 2
## # Groups:   Departure.Airport [5]
##   Departure.Airport Departure.Airport.On.Time.Rating..out.of.10.
##   <chr>                                                    <dbl>
## 1 BLR                                                       NA  
## 2 BOM                                                        6.2
## 3 CCU                                                       NA  
## 4 DEL                                                        7.3
## 5 HYD                                                        7.8
##Mumbai seems to have the worst rating for departure on time performance

# One row per (arrival airport, arrival on-time rating) combination
Airport_delay %>%
  group_by(
    Arrival.Airport,
    Arrival.Airport.On.Time.Rating..out.of.10.
  ) %>%
  summarise()
## `summarise()` has grouped output by 'Arrival.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 3 × 2
## # Groups:   Arrival.Airport [3]
##   Arrival.Airport Arrival.Airport.On.Time.Rating..out.of.10.
##   <chr>                                                <dbl>
## 1 BOM                                                    6.2
## 2 DEL                                                    7.3
## 3 HYD                                                    7.8
##Mumbai seems to have the worst rating for Arrival on time performance as well

Further Cleaning Datasets over imputations done previously

## Keep only the rows of each imputed weather table whose corresponding row in
## the original table has no NA in any column.
## NOTE(review): the row mask is computed on the original (un-imputed) table,
## so rows whose gaps were filled by the imputation are dropped as well -
## confirm this is intended.
New_Weather_Bangalore <- Weather_Bangalore_imp_lm_temp[
  complete.cases(Weather_Bangalore), , drop = FALSE
]
New_Weather_Chennai <- Weather_Chennai_imp_lm_temp[
  complete.cases(Weather_Chennai), , drop = FALSE
]
New_Weather_Delhi <- Weather_Delhi_imp_lm_temp[
  complete.cases(Weather_Delhi), , drop = FALSE
]
New_Weather_Lucknow <- Weather_Lucknow_imp_lm_temp[
  complete.cases(Weather_Lucknow), , drop = FALSE
]
New_Weather_Mumbai <- Weather_Mumbai_imp_lm_temp[
  complete.cases(Weather_Mumbai), , drop = FALSE
]
New_Weather_Jodhpur <- Weather_Jodhpur_imp_lm_temp[
  complete.cases(Weather_Jodhpur), , drop = FALSE
]

## For Bhubhenshwar and Rourkela, first drop the columns snow and tsun, which
## have no valid entries. The wdir, wspd, pres and wpgt columns are dropped too:
## the other stations do not have them, so they add no value for the scope of
## this analysis. Afterwards keep only the rows without any NA.
Standard_Weather_Bhubhneshwar <- subset(
  Weather_Bhubhneshwar,
  select = -c(snow, wdir, wspd, pres, tsun, wpgt)
)
New_Weather_Bhubhneshwar <- Standard_Weather_Bhubhneshwar[
  complete.cases(Standard_Weather_Bhubhneshwar), , drop = FALSE
]

Standard_Weather_Rourkela <- subset(
  Weather_Rourkela,
  select = -c(snow, wdir, wspd, pres, tsun, wpgt)
)
New_Weather_Rourkela <- Standard_Weather_Rourkela[
  complete.cases(Standard_Weather_Rourkela), , drop = FALSE
]


## When it comes to AQI stations, we need only active stations

New_AQ_stations <- filter(AQ_stations, Status == "Active")

## Keep the rows of each imputed AQ table whose corresponding row in the
## original table has no NA in any column.
## NOTE(review): the row mask is computed on the original (un-imputed) table,
## so rows filled by the imputation are dropped as well - confirm intent.
New_AQ_station_hour <- AQ_station_hour_imp_lm_temp[
  complete.cases(AQ_station_hour), , drop = FALSE
]
New_AQ_station_day <- AQ_station_day_imp_lm_temp[
  complete.cases(AQ_station_day), , drop = FALSE
]
New_AQ_city_hour <- AQ_city_hour_imp_lm_temp[
  complete.cases(AQ_city_hour), , drop = FALSE
]
New_AQ_city_day <- AQ_city_day_imp_lm_temp[
  complete.cases(AQ_city_day), , drop = FALSE
]

## Clean the Airport Delay data too: keep only the rows without any NA
New_Airport_delay <- Airport_delay[
  complete.cases(Airport_delay), , drop = FALSE
]

Detect outliers on Datasets

Since one of the hypotheses we have here is that extreme weather conditions

will affect the flight traffic, we are really looking for outliers unlike

normal cases where we tend to avoid outliers

Exploratory Analysis of Bangalore Weather Dataset

hist(New_Weather_Bangalore$tavg, main = "Bangalore Average Temparature")

## Outliers for Bangalore average temperature: below 20 or above 30

hist(New_Weather_Bangalore$tmin, main = "Bangalore Min Temparature")

## Outliers for Bangalore min temperature: below 16

hist(New_Weather_Bangalore$tmax, main = "Bangalore Max  Temparature")

## Outliers for Bangalore max temperature: above 35

hist(New_Weather_Bangalore$prcp, main = "Bangalore Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures
Special_Weather_Bangalore <- filter(
  New_Weather_Bangalore,
  tavg < 20 | tavg > 30 | tmin < 16 | tmax > 35 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Bangalore,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Bangalore$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 16 and 22,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Bangalore <- filter(
  Special_Weather_Bangalore,
  tmin > 16,
  tmin < 22
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Bangalore,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Bangalore$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Chennai Weather Dataset

hist(New_Weather_Chennai$tavg, main = "Chennai Average Temparature")

## Outliers for Chennai average temperature: below 15 or above 35

hist(New_Weather_Chennai$tmin, main = "Chennai Min Temparature")

## Outliers for Chennai min temperature: below 16

hist(New_Weather_Chennai$tmax, main = "Chennai Max  Temparature")

## Outliers for Chennai max temperature: above 35

hist(New_Weather_Chennai$prcp, main = "Chennai Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures.
## NOTE(review): the histogram notes above give tmin < 16 and tmax > 35, but
## this filter uses tmin < 10 and tmax > 30 - confirm which thresholds are
## intended.
Special_Weather_Chennai <- filter(
  New_Weather_Chennai,
  tavg < 15 | tavg > 35 | tmin < 10 | tmax > 30 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Chennai,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Chennai$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 20 and 30,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Chennai <- filter(
  Special_Weather_Chennai,
  tmin > 20,
  tmin < 30
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Chennai,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Chennai$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Delhi Weather Dataset

hist(New_Weather_Delhi$tavg, main = "Delhi Average Temparature")

## Outliers for Delhi average temperature: below 15 or above 35

hist(New_Weather_Delhi$tmin, main = "Delhi Min Temparature")

## Outliers for Delhi min temperature: below 16

hist(New_Weather_Delhi$tmax, main = "Delhi Max  Temparature")

## Outliers for Delhi max temperature: above 35

hist(New_Weather_Delhi$prcp, main = "Delhi Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures.
## NOTE(review): the histogram notes above give tmin < 16 and tmax > 35, but
## this filter uses tmin < 10 and tmax > 30 - confirm which thresholds are
## intended.
Special_Weather_Delhi <- filter(
  New_Weather_Delhi,
  tavg < 15 | tavg > 35 | tmin < 10 | tmax > 30 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Delhi,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Delhi$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 20 and 30,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Delhi <- filter(
  Special_Weather_Delhi,
  tmin > 20,
  tmin < 30
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Delhi,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Delhi$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Lucknow Weather Dataset

hist(New_Weather_Lucknow$tavg, main = "Lucknow Average Temparature")

## Outliers for Lucknow average temperature: below 16 or above 33

hist(New_Weather_Lucknow$tmin, main = "Lucknow Min Temparature")

## Outliers for Lucknow min temperature: below 15

hist(New_Weather_Lucknow$tmax, main = "Lucknow Max  Temparature")

## Outliers for Lucknow max temperature: above 35

hist(New_Weather_Lucknow$prcp, main = "Lucknow Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures.
## NOTE(review): the histogram note above gives tmax > 35, but this filter
## uses tmax > 30 - confirm which threshold is intended.
Special_Weather_Lucknow <- filter(
  New_Weather_Lucknow,
  tavg < 16 | tavg > 33 | tmin < 15 | tmax > 30 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Lucknow,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Lucknow$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 20 and 30,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Lucknow <- filter(
  Special_Weather_Lucknow,
  tmin > 20,
  tmin < 30
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Lucknow,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Lucknow$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Mumbai Weather Dataset

hist(New_Weather_Mumbai$tavg, main = "Mumbai Average Temparature")

## Outliers for Mumbai average temperature: below 25 or above 30

hist(New_Weather_Mumbai$tmin, main = "Mumbai Min Temparature")

## Outliers for Mumbai min temperature: below 17

hist(New_Weather_Mumbai$tmax, main = "Mumbai Max  Temparature")

## Outliers for Mumbai max temperature: above 35

hist(New_Weather_Mumbai$prcp, main = "Mumbai Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures
Special_Weather_Mumbai <- filter(
  New_Weather_Mumbai,
  tavg < 25 | tavg > 30 | tmin < 17 | tmax > 35 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Mumbai,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Mumbai$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 22 and 27,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Mumbai <- filter(
  Special_Weather_Mumbai,
  tmin > 22,
  tmin < 27
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Mumbai,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Mumbai$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Jodhpur Weather Dataset

hist(New_Weather_Jodhpur$tavg, main = "Jodhpur Average Temparature")

## Outliers for Jodhpur average temperature: below 22 or above 28

hist(New_Weather_Jodhpur$tmin, main = "Jodhpur Min Temparature")

## Outliers for Jodhpur min temperature: below 16

hist(New_Weather_Jodhpur$tmax, main = "Jodhpur Max  Temparature")

## Outliers for Jodhpur max temperature: above 33

hist(New_Weather_Jodhpur$prcp, main = "Jodhpur Precipitation", breaks = 5)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures
Special_Weather_Jodhpur <- filter(
  New_Weather_Jodhpur,
  tavg < 22 | tavg > 28 | tmin < 16 | tmax > 33 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Jodhpur,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Jodhpur$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 17 and 23,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Jodhpur <- filter(
  Special_Weather_Jodhpur,
  tmin > 17,
  tmin < 23
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Jodhpur,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Jodhpur$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Bhubhenshwar Weather Dataset

hist(New_Weather_Bhubhneshwar$tavg, main = "Bhubhenshwar Average Temparature")

## Outliers for Bhubhenshwar average temperature: below 24 or above 32

hist(New_Weather_Bhubhneshwar$tmin, main = "Bhubhenshwar Min Temparature")

## Outliers for Bhubhenshwar min temperature: below 15

hist(New_Weather_Bhubhneshwar$tmax, main = "Bhubhenshwar Max  Temparature")

## Outliers for Bhubhenshwar max temperature: above 35

hist(
  New_Weather_Bhubhneshwar$prcp,
  main = "Bhubhenshwar Precipitation",
  breaks = 5
)

## Extreme precipitation cases are above 50

## Collect the days that are extreme on at least one of these measures
Special_Weather_Bhubhenshwar <- filter(
  New_Weather_Bhubhneshwar,
  tavg < 24 | tavg > 32 | tmin < 15 | tmax > 35 | prcp > 50
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Bhubhenshwar,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Bhubhenshwar$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 17 and 27,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Bhubhenshwar <- filter(
  Special_Weather_Bhubhenshwar,
  tmin > 17,
  tmin < 27
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Bhubhenshwar,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Bhubhenshwar$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Rourkela Weather Dataset

hist(New_Weather_Rourkela$tavg, main = "Rourkela Average Temparature")

## Outliers for Rourkela average temperature: below 20 or above 32

hist(New_Weather_Rourkela$tmin, main = "Rourkela Min Temparature")

## Outliers for Rourkela min temperature: below 15

hist(New_Weather_Rourkela$tmax, main = "Rourkela Max  Temparature")

## Outliers for Rourkela max temperature: above 35

hist(New_Weather_Rourkela$prcp, main = "Rourkela Precipitation", breaks = 5)

## Extreme precipitation cases are above 40

## Collect the days that are extreme on at least one of these measures.
## NOTE(review): the histogram note above gives tmax > 35, but this filter
## uses tmax > 30 - confirm which threshold is intended.
Special_Weather_Rourkela <- filter(
  New_Weather_Rourkela,
  tavg < 20 | tavg > 32 | tmin < 15 | tmax > 30 | prcp > 40
)

## Precipitation has the most impact on flights, so look at how it relates to
## the temperatures (point colour and point size both follow prcp)
ggplot(
  data = Special_Weather_Rourkela,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Special_Weather_Rourkela$prcp / 50) +
  labs(title = "Impact of temperature on precipitation")

## From the picture, extreme precipitation happens when tmin is between 22 and 27,
## so narrow the dataset to that band and redraw
Ext_Special_Weather_Rourkela <- filter(
  Special_Weather_Rourkela,
  tmin > 22,
  tmin < 27
)

## Same temperature / precipitation view on the narrowed dataset
ggplot(
  data = Ext_Special_Weather_Rourkela,
  mapping = aes(x = tmin, y = tmax, colour = prcp)
) +
  geom_point(size = Ext_Special_Weather_Rourkela$prcp / 75) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of AQI data station wise

# Preview the first rows of the cleaned station-hour table
New_AQ_station_hour %>% head()
## # A tibble: 6 × 33
##   StationId Datetime       PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3
##   <chr>     <chr>          <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001     2017-11-25 09… 104    148.  1.93  23   13.8   9.8    0.1  15.3  118.
## 2 AP001     2017-11-25 10…  94.5  142   1.33  16.2  9.75  9.65   0.1  17    136.
## 3 AP001     2017-11-25 11…  82.8  126.  1.47  14.8  9.07  9.7    0.1  15.4  150.
## 4 AP001     2017-11-25 14…  68.5  117   1.35  13.6  8.35  7.4    0.1  21.8  162.
## 5 AP001     2017-11-25 15…  69.2  112.  1.52  11.8  7.55  9.25   0.1  21.4  162.
## 6 AP001     2017-11-25 16…  70    107   2.8   30.3 18.4   6.15   0.1  18.9  148.
## # ℹ 22 more variables: Benzene <dbl>, Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, StationId_NA <fct>, Datetime_NA <fct>, PM2.5_NA <fct>,
## #   PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>,
## #   CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## #   Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>
# Lets see the performance of the AQI over years
## Split the Datetime string into a Date part and Hr/Min/Sec parts, and keep
## the hour as a number so it can be binned.
AQ_station_Day_Sep <- New_AQ_station_hour %>%
  separate(Datetime, into = c("Date", "Time"), sep = " ") %>%
  separate(Time, into = c("Hr", "Min", "Sec"), sep = ":") %>%
  mutate(Hour = as.numeric(Hr))

## Bin the hour of day: (-1, 6] Early_Morning, (6, 18] Day, (18, 24] Night.
AQ_station_Day_Duration <- AQ_station_Day_Sep %>%
  mutate(
    Duration = cut(
      Hour,
      breaks = c(-1, 6, 18, 24),
      labels = c("Early_Morning", "Day", "Night")
    )
  )

AQI_Over_Years <- AQ_station_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Over_Years, aes(x = YEAR, y = Mean_AQI, color=AQI_Bucket))+  geom_line()

## It appears that 'Severe' and 'Poor' cases didn't exist much  until 2017 from which these
## two gained at the behest of 'Good' AQI cases

# Lets see the performance of the AQI over a day in every year

AQI_Over_Time <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors"); map it to shape instead.
ggplot(AQI_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2)

## We can see that the year 2017 witnessed the worst air quality index, and much of that
## was during the day time. Things eased in the later years,
## but the pattern changed, with night time pollution becoming the worst.
## In all cases, early morning pollution was the lowest.


# Lets see the performance of the AQI monthwise
AQI_monthwise <- AQ_station_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors"); map it to shape instead.
ggplot(AQI_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2)

## We can see that the colder months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season.

AQI_Over_month <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
# Lets see if how this works out yearwise and monthwise
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors" once per facet); map it to shape.
ggplot(AQI_Over_month, aes(x = Month, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2) +
  facet_wrap(~YEAR)

## We can see the same trend every year - i.e., the colder months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values

## Now lets report this city wise - probably for the Month wise combination
AQI_Stationwise <- AQ_station_Day_Duration %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
## Map Duration to shape inside aes() so ggplot2 matches it to the rows it
## draws and adds a legend, instead of passing the raw factor column as a
## fixed shape value.
ggplot(AQI_Stationwise, aes(x = Month, y = Mean_AQI, color = Station)) +
  geom_point(aes(shape = Duration))

## Across stations, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.

## Now out of the 19 stations, we are only interested in Delhi, for which we are going to do air traffic impact analysis - so lets filter them and zoom into their performance alone
AQI_Delhi_Station <- AQ_station_Day_Duration %>% filter( (StationId == "DL001") | (StationId == "DL019")) %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors" once per facet); map it to shape.
ggplot(AQI_Delhi_Station, aes(x = Month, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2) +
  facet_wrap(~YEAR)

# Lets see how AQ day data is different from station hour wise data
New_AQ_station_day_Years <- New_AQ_station_day%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
head(AQ_station_Day_Duration)
## # A tibble: 6 × 38
##   StationId Date     Hr    Min   Sec   PM2.5  PM10    NO   NO2   NOx   NH3    CO
##   <chr>     <chr>    <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AP001     2017-11… 09    00    00    104    148.  1.93  23   13.8   9.8    0.1
## 2 AP001     2017-11… 10    00    00     94.5  142   1.33  16.2  9.75  9.65   0.1
## 3 AP001     2017-11… 11    00    00     82.8  126.  1.47  14.8  9.07  9.7    0.1
## 4 AP001     2017-11… 14    00    00     68.5  117   1.35  13.6  8.35  7.4    0.1
## 5 AP001     2017-11… 15    00    00     69.2  112.  1.52  11.8  7.55  9.25   0.1
## 6 AP001     2017-11… 16    00    00     70    107   2.8   30.3 18.4   6.15   0.1
## # ℹ 26 more variables: SO2 <dbl>, O3 <dbl>, Benzene <dbl>, Toluene <dbl>,
## #   Xylene <dbl>, AQI <dbl>, AQI_Bucket <chr>, StationId_NA <fct>,
## #   Datetime_NA <fct>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>,
## #   NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>,
## #   O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>,
## #   AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>, Hour <dbl>,
## #   Duration <fct>
## There seems to be nothing new that we can derive out of the station day wise that we can't derive out of 
## station hour wise data. so no further analysis needed over here

Exploratory Analysis of AQI data city wise

## Lets look at City wise hourly AQI data
head(New_AQ_city_hour)
## # A tibble: 6 × 33
##   City    Datetime PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3 Benzene
##   <chr>   <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Amarav… 2017-11… 104    148.  1.93  23   13.8   9.8    0.1  15.3  118.    0.3 
## 2 Amarav… 2017-11…  94.5  142   1.33  16.2  9.75  9.65   0.1  17    136.    0.28
## 3 Amarav… 2017-11…  82.8  126.  1.47  14.8  9.07  9.7    0.1  15.4  150.    0.2 
## 4 Amarav… 2017-11…  68.5  117   1.35  13.6  8.35  7.4    0.1  21.8  162.    0.1 
## 5 Amarav… 2017-11…  69.2  112.  1.52  11.8  7.55  9.25   0.1  21.4  162.    0.1 
## 6 Amarav… 2017-11…  70    107   2.8   30.3 18.4   6.15   0.1  18.9  148.    0.1 
## # ℹ 21 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, City_NA <fct>, Datetime_NA <fct>, PM2.5_NA <fct>,
## #   PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>,
## #   CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## #   Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>
# Lets see the performance of the AQI over years
## Split the Datetime string into a Date part and Hr/Min/Sec parts, and keep
## the hour as a number so it can be binned.
AQ_city_Day_Sep <- New_AQ_city_hour %>%
  separate(Datetime, into = c("Date", "Time"), sep = " ") %>%
  separate(Time, into = c("Hr", "Min", "Sec"), sep = ":") %>%
  mutate(Hour = as.numeric(Hr))

## Bin the hour of day: (-1, 6] Early_Morning, (6, 18] Day, (18, 24] Night.
AQ_city_Day_Duration <- AQ_city_Day_Sep %>%
  mutate(
    Duration = cut(
      Hour,
      breaks = c(-1, 6, 18, 24),
      labels = c("Early_Morning", "Day", "Night")
    )
  )

## Now get it grouped by Year and plot year wise performance
AQI_City_Over_Years <- AQ_city_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_City_Over_Years, aes(x = YEAR, y = Mean_AQI, color=AQI_Bucket))+  geom_line()

AQI_City_Over_Time <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors"); map it to shape instead.
ggplot(AQI_City_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2)

## It appears 2015 had peak values of AQIs, which dropped to very low in 2016, gained to half the levels back in 2017 and then gradually reducing
## We can see that 2015-2017 worst was during day time but from 2018, there were worse night times - may be something to do with dropped levels of AQIs as well
## In all cases, early morning pollution seems to be the lowest.


# Lets see the performance of the AQI month wise
AQI_City_monthwise <- AQ_city_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors"); map it to shape instead.
ggplot(AQI_City_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2)

## We can see that in the winter months - i.e., from Oct to Feb - the AQI is the worst; it is bad during summer, and it appears the best in the monsoon season. The difference from the station-wise data is that here Nov seems to be the worst month, while in the other dataset Dec was the worst...

AQI_City_Over_month <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
AQI_City_Over_month
## # A tibble: 185 × 4
## # Groups:   YEAR, Month [63]
##     YEAR Month Duration      Mean_AQI
##    <dbl> <ord> <fct>            <dbl>
##  1  2015 Jan   Early_Morning     343.
##  2  2015 Jan   Day               341.
##  3  2015 Jan   Night             341.
##  4  2015 Feb   Early_Morning     329.
##  5  2015 Feb   Day               329.
##  6  2015 Feb   Night             325.
##  7  2015 Mar   Early_Morning     249.
##  8  2015 Mar   Day               262.
##  9  2015 Mar   Night             254.
## 10  2015 Apr   Early_Morning     304.
## # ℹ 175 more rows
# Lets see if how this works out yearwise and monthwise
## Duration is a factor, so it cannot be used as a point size (that is what
## raised "'*' not meaningful for factors" once per facet); map it to shape.
ggplot(AQI_City_Over_month, aes(x = Month, y = Mean_AQI, color = Duration)) +
  geom_point(aes(shape = Duration), size = 2) +
  facet_wrap(~YEAR)

## We can see the same trend every year - i.e., the winter months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values

## Now lets report this city wise - probably for the Month wise combination
AQI_Citywise <- AQ_city_Day_Sep %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Citywise, aes(x = Month, y = Mean_AQI))+  geom_point(aes(color=City))

## Across cities, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.

## Now out of all the cities, we are very interested on Delhi for which we are going to do air traffic impact analysis - so lets filter them and zoom into their performance alone
AQI_Delhi_City <- AQ_city_Day_Sep %>% filter( City == "Delhi") %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Delhi_City, aes(x = Month, y = Mean_AQI))+  geom_point() + facet_wrap(~YEAR)

New_AQ_city_day_Years <- New_AQ_city_day%>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
head(New_AQ_city_day_Years)
## # A tibble: 6 × 4
## # Groups:   City, YEAR [2]
##   City       YEAR Month Mean_AQI
##   <chr>     <dbl> <ord>    <dbl>
## 1 Amaravati  2017 Nov      184. 
## 2 Amaravati  2017 Dec      194. 
## 3 Amaravati  2018 Jan      172. 
## 4 Amaravati  2018 Feb      107. 
## 5 Amaravati  2018 Mar       84.6
## 6 Amaravati  2018 Apr       63.8
ggplot(New_AQ_city_day_Years, aes(x = Month, y = Mean_AQI))+  geom_point(aes(color=City))

## There seems to be a small difference when comparing hour wise data to day wise data, but it is not significant enough. So we will mainly use the hour wise data for city-wise analysis.

Model for the prediction of AQI index

## We would like to understand which of the parameters are really affecting AQI value.
## Based on the analysis above we will stick to using the Cleaned Station hour wise datasets.

New_AQ_station_hour_sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr), Month = month(ymd(Date)))

## Now lets focus on the months where we have the most troubles with AQI - Oct to Feb
New_AQ_station_hour_sep_BM <- New_AQ_station_hour_sep %>%
  filter(Month %in% c(1, 2, 10, 11, 12))

AQI_O3_model <- lm(AQI~O3, data = New_AQ_station_hour_sep)
fmodel(AQI_O3_model)

## OK wow, looks like AQI has a direct relationship with the O3 content

AQI_O3_model_BM <- lm(AQI~O3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_O3_model_BM)

## In bad months looks like O3 and AQI are inversely proportional

## Lets try with PM2.5
AQI_PM_2_5_model <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_2_5_model)

## OK even here there is an impact - actually much more
AQI_PM_2_5_model_BM <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_2_5_model_BM)

## PM2.5 impact seems to be much higher over the winter months

##Lets try others
AQI_PM_10_model <- lm(AQI~PM10, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_10_model)

AQI_PM_10_model_BM <- lm(AQI~PM10, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_10_model_BM)

## No significant impact change in winter months for PM10

AQI_NO_model <- lm(AQI~NO, data = New_AQ_station_hour_sep)
fmodel(AQI_NO_model)

AQI_NO_model_BM <- lm(AQI~NO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO_model_BM)

## Slight reduction in winter months for NO

AQI_NO2_model <- lm(AQI~NO2, data = New_AQ_station_hour_sep)
fmodel(AQI_NO2_model)

AQI_NO2_model_BM <- lm(AQI~NO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO2_model_BM)

## No significant impact change in winter months for NO2

AQI_NOx_model <- lm(AQI~NOx, data = New_AQ_station_hour_sep)
fmodel(AQI_NOx_model)

AQI_NOx_model_BM <- lm(AQI~NOx, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NOx_model_BM)

## Slight reduction in winter months for NOx

AQI_NH3_model <- lm(AQI~NH3, data = New_AQ_station_hour_sep)
fmodel(AQI_NH3_model)

AQI_NH3_model_BM <- lm(AQI~NH3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NH3_model_BM)

## NH3 impact seems to be much higher (50% more) over the winter months

AQI_CO_model <- lm(AQI~CO, data = New_AQ_station_hour_sep)
fmodel(AQI_CO_model)

AQI_CO_model_BM <- lm(AQI~CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_CO_model_BM)

## No significant impact change in winter months for CO

AQI_SO2_model <- lm(AQI~SO2, data = New_AQ_station_hour_sep)
fmodel(AQI_SO2_model)

AQI_SO2_model_BM <- lm(AQI~SO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_SO2_model_BM)

## Slight reduction in winter months for SO2

AQI_Benzene_model <- lm(AQI~Benzene, data = New_AQ_station_hour_sep)
fmodel(AQI_Benzene_model)

## Fit the bad-months Benzene model under its own `_BM` name, like the other
## pollutants, so it does not overwrite the all-months AQI_Benzene_model.
AQI_Benzene_model_BM <- lm(AQI ~ Benzene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Benzene_model_BM)

## Slight reduction in winter months for Benzene

AQI_Toluene_model <- lm(AQI~Toluene, data = New_AQ_station_hour_sep)
fmodel(AQI_Toluene_model)

AQI_Toluene_model_BM <- lm(AQI~Toluene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Toluene_model_BM)

## Slight reduction in winter months for Toluene

AQI_Xylene_model <- lm(AQI~Xylene, data = New_AQ_station_hour_sep)
fmodel(AQI_Xylene_model)

AQI_Xylene_model_BM <- lm(AQI~Xylene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Xylene_model_BM)

## No significant impact change in winter months for Xylene

## Among these, the highest impact seems to be from CO. Bringing in
## O3 due to their peculiar reversal in Winter months
AQI_High_Impact_model <- lm(AQI~O3+CO, data = New_AQ_station_hour_sep)
fmodel(AQI_High_Impact_model)

AQI_High_Impact_model_BM <- lm(AQI~O3+CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_High_Impact_model_BM)

evaluate_model(AQI_High_Impact_model)
##    O3  CO model_output
## 1   0 0.0     67.22385
## 2  50 0.0     88.07392
## 3 100 0.0    108.92398
## 4   0 0.5     95.43036
## 5  50 0.5    116.28043
## 6 100 0.5    137.13050
## 7   0 1.0    123.63687
## 8  50 1.0    144.48694
## 9 100 1.0    165.33701
evaluate_model(AQI_High_Impact_model_BM)
##    O3 CO model_output
## 1   0  0     108.7309
## 2  50  0     116.8346
## 3 100  0     124.9383
## 4   0  1     157.0225
## 5  50  1     165.1262
## 6 100  1     173.2299
## 7   0  2     205.3141
## 8  50  2     213.4178
## 9 100  2     221.5215
## Definitely the bad months bring a lot of difference into the data set.
## So lets consider month as one of the explanatory variables
New_AQ_station_hour_sep
## # A tibble: 203,693 × 38
##    StationId Date    Hr    Min   Sec   PM2.5  PM10    NO   NO2   NOx   NH3    CO
##    <chr>     <chr>   <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AP001     2017-1… 09    00    00    104    148.  1.93  23   13.8   9.8    0.1
##  2 AP001     2017-1… 10    00    00     94.5  142   1.33  16.2  9.75  9.65   0.1
##  3 AP001     2017-1… 11    00    00     82.8  126.  1.47  14.8  9.07  9.7    0.1
##  4 AP001     2017-1… 14    00    00     68.5  117   1.35  13.6  8.35  7.4    0.1
##  5 AP001     2017-1… 15    00    00     69.2  112.  1.52  11.8  7.55  9.25   0.1
##  6 AP001     2017-1… 16    00    00     70    107   2.8   30.3 18.4   6.15   0.1
##  7 AP001     2017-1… 17    00    00     72.8  120.  1.5   26.7 15.4  10.8    0.1
##  8 AP001     2017-1… 18    00    00     81.5  135.  1.1   18.8 10.9  14.7    0.1
##  9 AP001     2017-1… 19    00    00     85    142.  1.62  26.2 15.3  14.5    0.2
## 10 AP001     2017-1… 20    00    00     91.5  146.  0.98  18.9 10.8  14.1    0.2
## # ℹ 203,683 more rows
## # ℹ 26 more variables: SO2 <dbl>, O3 <dbl>, Benzene <dbl>, Toluene <dbl>,
## #   Xylene <dbl>, AQI <dbl>, AQI_Bucket <chr>, StationId_NA <fct>,
## #   Datetime_NA <fct>, PM2.5_NA <fct>, PM10_NA <fct>, NO_NA <fct>,
## #   NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>, CO_NA <fct>, SO2_NA <fct>,
## #   O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>, Xylene_NA <fct>,
## #   AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>, Hour <dbl>, …
AQI_High_Impact_model_Month <- lm(AQI~O3+CO+month(ymd(Date)), data = New_AQ_station_hour_sep)
fmodel(AQI_High_Impact_model_Month)

evaluate_model(AQI_High_Impact_model_Month)
##     O3  CO       Date model_output
## 1    0 0.0 2020-03-10     65.46391
## 2   50 0.0 2020-03-10     86.35594
## 3  100 0.0 2020-03-10    107.24797
## 4    0 0.5 2020-03-10     93.61949
## 5   50 0.5 2020-03-10    114.51152
## 6  100 0.5 2020-03-10    135.40355
## 7    0 1.0 2020-03-10    121.77507
## 8   50 1.0 2020-03-10    142.66710
## 9  100 1.0 2020-03-10    163.55913
## 10   0 0.0 2020-03-07     65.46391
## 11  50 0.0 2020-03-07     86.35594
## 12 100 0.0 2020-03-07    107.24797
## 13   0 0.5 2020-03-07     93.61949
## 14  50 0.5 2020-03-07    114.51152
## 15 100 0.5 2020-03-07    135.40355
## 16   0 1.0 2020-03-07    121.77507
## 17  50 1.0 2020-03-07    142.66710
## 18 100 1.0 2020-03-07    163.55913
## 19   0 0.0 2020-03-08     65.46391
## 20  50 0.0 2020-03-08     86.35594
## 21 100 0.0 2020-03-08    107.24797
## 22   0 0.5 2020-03-08     93.61949
## 23  50 0.5 2020-03-08    114.51152
## 24 100 0.5 2020-03-08    135.40355
## 25   0 1.0 2020-03-08    121.77507
## 26  50 1.0 2020-03-08    142.66710
## 27 100 1.0 2020-03-08    163.55913
## Having month as part of the model really makes a difference to the evaluation.

## Now lets train the model and see if we can predict the values of AQI
#make this split reproducible
set.seed(1)

#Use 70% of dataset as training set and remaining 30% as testing set
## One TRUE/FALSE draw per row; TRUE rows go to the training set.
AQI_sample_set <- sample(
  c(TRUE, FALSE),
  nrow(New_AQ_station_hour_sep),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
AQI_train_dataset <- New_AQ_station_hour_sep[AQI_sample_set, ]
AQI_test_dataset <- New_AQ_station_hour_sep[!AQI_sample_set, ]

## Fit on the training rows only.
AQI_Eval_model <- lm(
  AQI ~ O3 + CO + month(ymd(Date)),
  data = AQI_train_dataset
)
summary(AQI_Eval_model)
## 
## Call:
## lm(formula = AQI ~ O3 + CO + month(ymd(Date)), data = AQI_train_dataset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2339.28   -41.21   -14.55    18.77   728.61 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      63.602092   0.464663  136.88   <2e-16 ***
## O3                0.419003   0.006315   66.35   <2e-16 ***
## CO               56.352626   0.294424  191.40   <2e-16 ***
## month(ymd(Date))  0.576123   0.048342   11.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.87 on 142592 degrees of freedom
## Multiple R-squared:  0.213,  Adjusted R-squared:  0.213 
## F-statistic: 1.286e+04 on 3 and 142592 DF,  p-value: < 2.2e-16
Predicted_AQI_Values <- predict(AQI_Eval_model, AQI_test_dataset)

AQI_test_dataset["Predicted_AQI"] <- Predicted_AQI_Values

## Keep every test row next to its prediction, tagged with year and month.
## reframe() replaces the multi-row summarise() that dplyr 1.1.0 deprecated;
## it returns one row per input row here and an ungrouped tibble.
Summary_AQI_Model_Performace <- AQI_test_dataset %>%
  group_by(YEAR = year(ymd(Date)), Month) %>%
  reframe(AQI, Predicted_AQI)
Summary_AQI_Model_Performace
## # A tibble: 61,097 × 4
## # Groups:   YEAR, Month [58]
##     YEAR Month   AQI Predicted_AQI
##    <dbl> <dbl> <dbl>         <dbl>
##  1  2015     9    61          95.8
##  2  2015     9    61         117. 
##  3  2015     9    72         126. 
##  4  2015     9    72          95.4
##  5  2015     9    77         119. 
##  6  2015     9    66         129. 
##  7  2015     9    66         102. 
##  8  2015     9    66         108. 
##  9  2015     9    62          96.7
## 10  2015     9    62         106. 
## # ℹ 61,087 more rows
## Actual vs. predicted AQI per month, one panel per year. The colour
## strings are the legend labels.
ggplot(Summary_AQI_Model_Performace, aes(x = Month)) +
  geom_point(aes(y = AQI, color = "AQI")) +
  geom_point(aes(y = Predicted_AQI, color = "Predicted_AQI")) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "AQI Model Performance") +
  facet_wrap(~YEAR)

## We can see that there are a good amount overlaps between the AQI prediction vs actual data though
## there is still a very large scope of improvement of the model - esp when dealing with outliers.
## But so far we have sufficient proof available that AQI is heavily influenced by 
## month of the year and quantities of O3 and CO.

Form a cohesive Delhi dataset

## We have seen how components of air impacted AQI
## Time to see the impact of weather on AQI by merging the station day wise data with the weather data
## Please note we are not picking up station hour wise data because the weather data we have is only day wise data

## Out of the cities for which weather has been provided, the only city that overlaps with the AQI data is Delhi
## And ofcourse we are trying to find the impact of AQI on Airtraffic in Delhi, so lets bring in that too
## So lets merge these three datasets only for Delhi
## Keep the two Delhi stations and add a proper Date-class copy of Date.
Delhi_AQI_data_temp <- New_AQ_station_day %>%
  filter(StationId %in% c("DL001", "DL019")) %>%
  mutate(Date_1 = as.Date(Date))
## Swap the original Date column for the Date-class one. The column is
## dropped by name rather than by position so that a change in column order
## cannot remove the wrong column.
Delhi_AQI_data <- Delhi_AQI_data_temp %>%
  select(-Date) %>%
  rename(Date = Date_1)
Delhi_AQI_data
## # A tibble: 1,126 × 33
##    StationId PM2.5  PM10    NO   NO2   NOx   NH3    CO   SO2    O3 Benzene
##    <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
##  1 DL001     238.   349.  3.25  79.0  44.6  36.0  1.53  14.4  45.3    6.64
##  2 DL001     285.   427. 20.0  113.   76.2  41.2  1.43  21.3  30.4    8.8 
##  3 DL001     150.   214.  6.35  96.2  56.3  36.0  0.99  12.7  41.9    5.64
##  4 DL001     106.   154.  7.36  77.5  47.2  30.4  0.7   12.5  27.9    4.2 
##  5 DL001     146.   211.  2.01  60.3  33.6  28.5  0.71  11.6  46.1    2.61
##  6 DL001     167.   272.  9.92  80.4  50.8  29.9  0.98  14    33.5    3.84
##  7 DL001     219.   347. 23.9   99.6  72.5  35.0  1.33  19.8  44.8    5.56
##  8 DL001     227.   356.  2.1   77.9  43.2  37.1  1.02  21.2  52.6    4.49
##  9 DL001      86.5  206.  2.71  58.0  33.0  25.6  0.59  18.2  39.4    2.33
## 10 DL001     150.   287. 23.2   70.8  56.6  28.2  1     20.7  34.0    3.84
## # ℹ 1,116 more rows
## # ℹ 22 more variables: Toluene <dbl>, Xylene <dbl>, AQI <dbl>,
## #   AQI_Bucket <chr>, StationId_NA <fct>, Date_NA <fct>, PM2.5_NA <fct>,
## #   PM10_NA <fct>, NO_NA <fct>, NO2_NA <fct>, NOx_NA <fct>, NH3_NA <fct>,
## #   CO_NA <fct>, SO2_NA <fct>, O3_NA <fct>, Benzene_NA <fct>, Toluene_NA <fct>,
## #   Xylene_NA <fct>, AQI_NA <fct>, AQI_Bucket_NA <fct>, any_missing <chr>,
## #   Date <date>
New_Weather_Delhi_day <- New_Weather_Delhi %>% mutate(Date = dmy(time))

Delhi_Airport_Delay_date <- New_Airport_delay %>% filter (Departure.Airport == "DEL") %>% mutate(Date_1 = dmy(Date))
Delhi_Airport_Delay_rename <- Delhi_Airport_Delay_date[, -1] %>% rename("Date" = "Date_1")
Delhi_Airport_Delay_date_sorted <- Delhi_Airport_Delay_rename[order(Delhi_Airport_Delay_rename$Date),]

## The range of weather data is from 01/01/1990 to 25/07/2022
## The range of airport delay data is from 28/01/18 to 27/1/2020

## So the overlapping range is from 1/11/2018 to 26/1/2020

# Shared analysis window for the three Delhi datasets. between() keeps rows
# whose Date lies on or between the two bounds.
Delhi_range_start <- as.Date('2018-01-25')
Delhi_range_end <- as.Date('2020-01-27')

Delhi_Airport_Delay_range <- Delhi_Airport_Delay_date_sorted %>%
  filter(between(Date, Delhi_range_start, Delhi_range_end))
# Earlier attempt, kept for reference:
#Delhi_Airport_Delay <- Delhi_Airport_Delay_dates %>% filter ((Date >'25-01-18') & (Date < '29-01-20'))  #1925
New_Weather_Delhi_day_range <- New_Weather_Delhi_day %>%
  filter(between(Date, Delhi_range_start, Delhi_range_end))

Delhi_AQI_data_range <- Delhi_AQI_data %>%
  filter(between(Date, Delhi_range_start, Delhi_range_end))

## Delhi_Airport_Delay data has multiple entries per day because it spans the many airlines operating at an airport. We are interested in the average delay per day rather than in airline-specific information, so let's clean the data up a bit.

# Convert a single delay value into whole minutes.
#
# x: one delay value. The hms() call below implies an "H:M:S" string that
#    lubridate can parse (assumption -- confirm against the delay columns).
#
# Returns a single number:
#   * 0            when the delay is negative (early) or blank,
#   * NA           when the delay is missing,
#   * h * 60 + m   otherwise; the seconds component is not counted.
#
# Called element-wise (e.g. through lapply), so x is expected to be length 1.
convert_min <- function(x) {
  # A missing delay cannot be classified as early or late. The previous
  # `if (x < 0)` aborted on NA; propagate the missing value instead.
  if (is.na(x)) {
    return(NA_real_)
  }

  if (is.numeric(x)) {
    is_early <- x < 0
  } else {
    # For text input, `x < 0` is a string comparison against "0" whose result
    # depends on the locale's collation rules. Look at the sign explicitly.
    # Blank strings sort before "0" in any collation and therefore used to
    # yield 0; keep that outcome.
    txt <- trimws(as.character(x))
    is_early <- !nzchar(txt) || startsWith(txt, "-")
  }

  if (is_early) {
    return(0)
  }

  time_d <- hms(x)
  hour(time_d) * 60 + minute(time_d)
}

# Translate every flight's departure and arrival delay into minutes, one
# value at a time, through convert_min().
Delhi_Airport_Delay_in_min <- Delhi_Airport_Delay_range %>%
  mutate(
    Departure_Delay_min = unlist(lapply(Departure.Delay, convert_min)),
    Arrival_Delay_min = unlist(lapply(Arrival.Time.Delay, convert_min))
  )

# Collapse to one row per Date holding the summed departure + arrival minutes.
# NOTE(review): the prose above speaks of an "average delay per day", but this
# is a sum() -- confirm which of the two is intended.
Delhi_Airport_Delay_datewise <- Delhi_Airport_Delay_in_min %>%
  group_by(Date) %>%
  summarize(Daily_Delay = sum(Departure_Delay_min + Arrival_Delay_min))

# Join weather with AQI. No `by` is supplied, so merge() matches on every
# column name the two tables have in common.
Delhi_AQI_weather_data_merge_temp <- merge(
  New_Weather_Delhi_day,
  Delhi_AQI_data
)

# Remove the third column of the joined table (selected by position).
Delhi_AQI_weather_data_merge_temp_1 <- Delhi_AQI_weather_data_merge_temp[, -3]

# Attach the per-day delay figures and derive the calendar month from Date.
Delhi_cohesive_dataset <- mutate(
  merge(Delhi_AQI_weather_data_merge_temp_1, Delhi_Airport_Delay_datewise),
  Month = month(ymd(Date))
)
Delhi_cohesive_dataset
##           Date any_missing tavg tmin tmax prcp time_NA tavg_NA tmin_NA tmax_NA
## 1   2018-06-27 Not Missing 30.3 26.2 37.5  3.0     !NA     !NA     !NA     !NA
## 2   2018-06-28 Not Missing 29.9 24.2 37.5 20.1     !NA     !NA     !NA     !NA
## 3   2018-06-29 Not Missing 30.7 27.9 35.2  1.0     !NA     !NA     !NA     !NA
## 4   2018-06-30 Not Missing 31.3 27.5 35.6  9.9     !NA     !NA     !NA     !NA
## 5   2018-07-04 Not Missing 31.7 26.1 36.7  5.1     !NA     !NA     !NA     !NA
## 6   2018-07-06 Not Missing 32.9 28.1 37.3  5.1     !NA     !NA     !NA     !NA
## 7   2018-07-07 Not Missing 33.7 28.2 39.1  0.0     !NA     !NA     !NA     !NA
## 8   2018-07-12 Not Missing 32.4 28.6 36.7  0.0     !NA     !NA     !NA     !NA
## 9   2018-07-14 Not Missing 30.3 25.4 34.2 22.1     !NA     !NA     !NA     !NA
## 10  2018-07-15 Not Missing 31.8 27.6 36.3  7.1     !NA     !NA     !NA     !NA
## 11  2018-07-16 Not Missing 29.8 28.1 35.4  0.0     !NA     !NA     !NA     !NA
## 12  2018-07-17 Not Missing 32.6 28.0 37.1 24.9     !NA     !NA     !NA     !NA
## 13  2018-07-18 Not Missing 32.9 28.6 37.2  0.0     !NA     !NA     !NA     !NA
## 14  2018-07-19 Not Missing 32.8 28.8 37.3  0.8     !NA     !NA     !NA     !NA
## 15  2018-07-20 Not Missing 29.2 27.1 37.8 23.9     !NA     !NA     !NA     !NA
## 16  2018-07-21 Not Missing 29.2 27.4 33.4  0.0     !NA     !NA     !NA     !NA
## 17  2018-07-22 Not Missing 29.0 26.0 34.4 18.0     !NA     !NA     !NA     !NA
## 18  2018-07-23 Not Missing 30.7 26.2 34.6 70.1     !NA     !NA     !NA     !NA
## 19  2018-07-24 Not Missing 32.4 28.2 36.4  0.0     !NA     !NA     !NA     !NA
## 20  2018-07-26 Not Missing 26.9 26.2 32.2  5.1     !NA     !NA     !NA     !NA
## 21  2018-07-27 Not Missing 26.7 25.2 28.7 46.0     !NA     !NA     !NA     !NA
## 22  2018-07-28 Not Missing 28.2 25.4 31.8  3.0     !NA     !NA     !NA     !NA
## 23  2018-07-29 Not Missing 30.3 26.0 34.4  5.1     !NA     !NA     !NA     !NA
## 24  2018-07-30 Not Missing 30.9 23.0 34.8 24.9     !NA     !NA     !NA     !NA
## 25  2018-07-31 Not Missing 30.0 27.4 34.5  0.0     !NA     !NA     !NA     !NA
## 26  2018-08-06 Not Missing 28.1 26.6 33.2  3.0     !NA     !NA     !NA     !NA
## 27  2018-08-07 Not Missing 28.2 26.4 31.8  2.0     !NA     !NA     !NA     !NA
## 28  2018-08-08 Not Missing 30.4 26.5 33.4  4.1     !NA     !NA     !NA     !NA
## 29  2018-08-09 Not Missing 32.0 27.6 35.4  0.5     !NA     !NA     !NA     !NA
## 30  2018-11-03 Not Missing 25.2 18.5 31.6  0.3     !NA     !NA     !NA     !NA
## 31  2018-11-04 Not Missing 24.4 18.0 30.6  0.0     !NA     !NA     !NA     !NA
## 32  2018-11-14 Not Missing 22.2 17.6 29.4  7.1     !NA     !NA     !NA     !NA
## 33  2018-11-14 Not Missing 22.2 17.6 29.4  7.1     !NA     !NA     !NA     !NA
## 34  2018-11-15 Not Missing 22.2 16.2 28.5  1.0     !NA     !NA     !NA     !NA
## 35  2018-11-15 Not Missing 22.2 16.2 28.5  1.0     !NA     !NA     !NA     !NA
## 36  2018-11-16 Not Missing 21.1 14.5 27.5  0.0     !NA     !NA     !NA     !NA
## 37  2018-11-16 Not Missing 21.1 14.5 27.5  0.0     !NA     !NA     !NA     !NA
## 38  2018-12-12 Not Missing 17.2 12.6 22.3  0.5     !NA     !NA     !NA     !NA
## 39  2018-12-12 Not Missing 17.2 12.6 22.3  0.5     !NA     !NA     !NA     !NA
## 40  2018-12-13 Not Missing 16.4 11.2 21.6  1.0     !NA     !NA     !NA     !NA
## 41  2018-12-13 Not Missing 16.4 11.2 21.6  1.0     !NA     !NA     !NA     !NA
## 42  2018-12-14 Not Missing 15.7  8.0 21.4  0.0     !NA     !NA     !NA     !NA
## 43  2018-12-14 Not Missing 15.7  8.0 21.4  0.0     !NA     !NA     !NA     !NA
## 44  2019-01-06 Not Missing 13.9  8.6 21.2  2.0     !NA     !NA     !NA     !NA
## 45  2019-01-06 Not Missing 13.9  8.6 21.2  2.0     !NA     !NA     !NA     !NA
## 46  2019-01-21 Not Missing 16.4 11.5 28.7  3.0     !NA     !NA     !NA     !NA
## 47  2019-01-21 Not Missing 16.4 11.5 28.7  3.0     !NA     !NA     !NA     !NA
## 48  2019-01-22 Not Missing 14.5 13.6 22.6 15.0     !NA     !NA     !NA     !NA
## 49  2019-01-22 Not Missing 14.5 13.6 22.6 15.0     !NA     !NA     !NA     !NA
## 50  2019-01-23 Not Missing 13.8  8.0 19.4 27.9     !NA     !NA     !NA     !NA
## 51  2019-01-23 Not Missing 13.8  8.0 19.4 27.9     !NA     !NA     !NA     !NA
## 52  2019-01-24 Not Missing 14.8  9.7 21.0  0.0     !NA     !NA     !NA     !NA
## 53  2019-01-24 Not Missing 14.8  9.7 21.0  0.0     !NA     !NA     !NA     !NA
## 54  2019-01-25 Not Missing 13.4 11.0 22.0  6.1     !NA     !NA     !NA     !NA
## 55  2019-01-25 Not Missing 13.4 11.0 22.0  6.1     !NA     !NA     !NA     !NA
## 56  2019-01-26 Not Missing 12.2  6.0 19.0  0.0     !NA     !NA     !NA     !NA
## 57  2019-01-26 Not Missing 12.2  6.0 19.0  0.0     !NA     !NA     !NA     !NA
## 58  2019-02-01 Not Missing 16.3 11.0 22.0  0.0     !NA     !NA     !NA     !NA
## 59  2019-02-01 Not Missing 16.3 11.0 22.0  0.0     !NA     !NA     !NA     !NA
## 60  2019-02-02 Not Missing 12.9  8.2 22.5  0.0     !NA     !NA     !NA     !NA
## 61  2019-02-02 Not Missing 12.9  8.2 22.5  0.0     !NA     !NA     !NA     !NA
## 62  2019-02-06 Not Missing 19.5 12.0 25.0  0.8     !NA     !NA     !NA     !NA
## 63  2019-02-06 Not Missing 19.5 12.0 25.0  0.8     !NA     !NA     !NA     !NA
## 64  2019-02-07 Not Missing 16.5 13.6 25.1  0.0     !NA     !NA     !NA     !NA
## 65  2019-02-07 Not Missing 16.5 13.6 25.1  0.0     !NA     !NA     !NA     !NA
## 66  2019-02-08 Not Missing 14.9  7.0 20.4  5.1     !NA     !NA     !NA     !NA
## 67  2019-02-08 Not Missing 14.9  7.0 20.4  5.1     !NA     !NA     !NA     !NA
## 68  2019-02-09 Not Missing 14.8  8.4 20.6  0.0     !NA     !NA     !NA     !NA
## 69  2019-02-09 Not Missing 14.8  8.4 20.6  0.0     !NA     !NA     !NA     !NA
## 70  2019-02-14 Not Missing 17.7 13.9 25.3  1.0     !NA     !NA     !NA     !NA
## 71  2019-02-14 Not Missing 17.7 13.9 25.3  1.0     !NA     !NA     !NA     !NA
## 72  2019-02-15 Not Missing 17.6 11.8 23.0  9.9     !NA     !NA     !NA     !NA
## 73  2019-02-15 Not Missing 17.6 11.8 23.0  9.9     !NA     !NA     !NA     !NA
## 74  2019-02-16 Not Missing 16.0 13.3 22.0  0.0     !NA     !NA     !NA     !NA
## 75  2019-02-16 Not Missing 16.0 13.3 22.0  0.0     !NA     !NA     !NA     !NA
## 76  2019-02-19 Not Missing 18.7 12.0 25.8  2.0     !NA     !NA     !NA     !NA
## 77  2019-02-19 Not Missing 18.7 12.0 25.8  2.0     !NA     !NA     !NA     !NA
## 78  2019-02-20 Not Missing 18.3 14.5 24.5  2.0     !NA     !NA     !NA     !NA
## 79  2019-02-20 Not Missing 18.3 14.5 24.5  2.0     !NA     !NA     !NA     !NA
## 80  2019-02-21 Not Missing 21.6 13.2 28.0  0.8     !NA     !NA     !NA     !NA
## 81  2019-02-21 Not Missing 21.6 13.2 28.0  0.8     !NA     !NA     !NA     !NA
## 82  2019-02-22 Not Missing 19.1 15.4 28.1  0.0     !NA     !NA     !NA     !NA
## 83  2019-02-22 Not Missing 19.1 15.4 28.1  0.0     !NA     !NA     !NA     !NA
## 84  2019-02-26 Not Missing 15.3 10.3 25.4  1.0     !NA     !NA     !NA     !NA
## 85  2019-02-26 Not Missing 15.3 10.3 25.4  1.0     !NA     !NA     !NA     !NA
## 86  2019-02-27 Not Missing 14.3  9.5 21.0  0.0     !NA     !NA     !NA     !NA
## 87  2019-03-02 Not Missing 15.2 12.6 24.1  0.5     !NA     !NA     !NA     !NA
## 88  2019-03-02 Not Missing 15.2 12.6 24.1  0.5     !NA     !NA     !NA     !NA
## 89  2019-03-03 Not Missing 17.1 12.8 22.4  9.9     !NA     !NA     !NA     !NA
## 90  2019-03-03 Not Missing 17.1 12.8 22.4  9.9     !NA     !NA     !NA     !NA
## 91  2019-03-04 Not Missing 19.1 11.2 24.0  0.3     !NA     !NA     !NA     !NA
## 92  2019-03-04 Not Missing 19.1 11.2 24.0  0.3     !NA     !NA     !NA     !NA
## 93  2019-03-05 Not Missing 18.4 12.0 23.9  0.0     !NA     !NA     !NA     !NA
## 94  2019-03-05 Not Missing 18.4 12.0 23.9  0.0     !NA     !NA     !NA     !NA
## 95  2019-03-09 Not Missing 20.6 12.6 26.5  0.0     !NA     !NA     !NA     !NA
## 96  2019-03-09 Not Missing 20.6 12.6 26.5  0.0     !NA     !NA     !NA     !NA
## 97  2019-03-15 Not Missing 20.2 10.4 26.0  0.5     !NA     !NA     !NA     !NA
## 98  2019-03-15 Not Missing 20.2 10.4 26.0  0.5     !NA     !NA     !NA     !NA
## 99  2019-03-16 Not Missing 21.6 12.5 26.0  0.0     !NA     !NA     !NA     !NA
## 100 2019-03-16 Not Missing 21.6 12.5 26.0  0.0     !NA     !NA     !NA     !NA
## 101 2019-03-31 Not Missing 28.8 21.0 39.2  0.3     !NA     !NA     !NA     !NA
## 102 2019-04-01 Not Missing 27.7 17.6 34.3  0.0     !NA     !NA     !NA     !NA
## 103 2019-04-06 Not Missing 33.6 22.4 39.3  0.0     !NA     !NA     !NA     !NA
## 104 2019-04-07 Not Missing 30.8 24.0 37.8  6.1     !NA     !NA     !NA     !NA
## 105 2019-04-12 Not Missing 30.6 23.2 38.7  0.5     !NA     !NA     !NA     !NA
## 106 2019-04-13 Not Missing 31.4 21.6 38.0  0.3     !NA     !NA     !NA     !NA
## 107 2019-04-14 Not Missing 32.2 21.8 38.0  0.0     !NA     !NA     !NA     !NA
## 108 2019-04-16 Not Missing 26.0 20.5 40.0  1.0     !NA     !NA     !NA     !NA
## 109 2019-04-17 Not Missing 24.3 19.0 30.7  1.0     !NA     !NA     !NA     !NA
## 110 2019-04-18 Not Missing 25.8 17.1 31.0  0.5     !NA     !NA     !NA     !NA
## 111 2019-04-19 Not Missing 27.5 18.2 32.0  0.0     !NA     !NA     !NA     !NA
## 112 2019-05-03 Not Missing 32.6 24.0 41.0  0.0     !NA     !NA     !NA     !NA
## 113 2019-05-04 Not Missing 32.1 22.4 39.5  4.1     !NA     !NA     !NA     !NA
## 114 2019-05-11 Not Missing 33.9 24.4 41.2  0.0     !NA     !NA     !NA     !NA
## 115 2019-05-14 Not Missing 30.8 20.8 40.2  3.0     !NA     !NA     !NA     !NA
## 116 2019-05-15 Not Missing 29.4 23.0 35.0  0.0     !NA     !NA     !NA     !NA
## 117 2019-05-16 Not Missing 30.6 23.4 36.4  2.0     !NA     !NA     !NA     !NA
## 118 2019-05-17 Not Missing 29.6 21.6 37.0  0.0     !NA     !NA     !NA     !NA
## 119 2019-05-18 Not Missing 28.7 19.5 37.4 10.9     !NA     !NA     !NA     !NA
## 120 2019-05-19 Not Missing 31.9 23.2 37.0  0.0     !NA     !NA     !NA     !NA
## 121 2019-05-24 Not Missing 29.8 23.8 40.4  6.1     !NA     !NA     !NA     !NA
## 122 2019-05-25 Not Missing 32.5 24.5 37.0  0.0     !NA     !NA     !NA     !NA
## 123 2019-06-16 Not Missing 31.5 28.4 43.4  0.0     !NA     !NA     !NA     !NA
## 124 2019-06-17 Not Missing 30.3 25.6 36.3  0.5     !NA     !NA     !NA     !NA
## 125 2019-06-18 Not Missing 28.5 20.6 35.0 10.9     !NA     !NA     !NA     !NA
## 126 2019-06-19 Not Missing 32.2 24.2 37.0  0.0     !NA     !NA     !NA     !NA
## 127 2019-06-22 Not Missing 35.6 27.2 40.0  0.0     !NA     !NA     !NA     !NA
## 128 2019-07-04 Not Missing 30.9 28.6 39.4  0.0     !NA     !NA     !NA     !NA
## 129 2019-07-05 Not Missing 28.8 26.9 38.6  1.0     !NA     !NA     !NA     !NA
## 130 2019-07-05 Not Missing 28.8 26.9 38.6  1.0     !NA     !NA     !NA     !NA
## 131 2019-07-06 Not Missing 33.4 26.5 37.0 24.9     !NA     !NA     !NA     !NA
## 132 2019-07-06 Not Missing 33.4 26.5 37.0 24.9     !NA     !NA     !NA     !NA
## 133 2019-07-07 Not Missing 32.1 27.6 37.0  0.0     !NA     !NA     !NA     !NA
## 134 2019-07-07 Not Missing 32.1 27.6 37.0  0.0     !NA     !NA     !NA     !NA
## 135 2019-07-08 Not Missing 33.7 28.6 36.0  0.0     !NA     !NA     !NA     !NA
## 136 2019-07-08 Not Missing 33.7 28.6 36.0  0.0     !NA     !NA     !NA     !NA
## 137 2019-07-17 Not Missing 27.0 24.0 33.4 22.1     !NA     !NA     !NA     !NA
## 138 2019-07-17 Not Missing 27.0 24.0 33.4 22.1     !NA     !NA     !NA     !NA
## 139 2019-07-18 Not Missing 27.0 23.5 31.7 11.9     !NA     !NA     !NA     !NA
## 140 2019-07-18 Not Missing 27.0 23.5 31.7 11.9     !NA     !NA     !NA     !NA
## 141 2019-07-19 Not Missing 31.9 25.0 36.0  4.1     !NA     !NA     !NA     !NA
## 142 2019-07-19 Not Missing 31.9 25.0 36.0  4.1     !NA     !NA     !NA     !NA
## 143 2019-07-20 Not Missing 30.7 26.0 36.2  0.0     !NA     !NA     !NA     !NA
## 144 2019-07-20 Not Missing 30.7 26.0 36.2  0.0     !NA     !NA     !NA     !NA
## 145 2019-07-21 Not Missing 30.6 25.6 36.7  8.9     !NA     !NA     !NA     !NA
## 146 2019-07-21 Not Missing 30.6 25.6 36.7  8.9     !NA     !NA     !NA     !NA
## 147 2019-07-22 Not Missing 31.3 24.6 36.5 50.0     !NA     !NA     !NA     !NA
## 148 2019-07-22 Not Missing 31.3 24.6 36.5 50.0     !NA     !NA     !NA     !NA
## 149 2019-07-23 Not Missing 34.2 28.2 37.0  4.1     !NA     !NA     !NA     !NA
## 150 2019-07-23 Not Missing 34.2 28.2 37.0  4.1     !NA     !NA     !NA     !NA
## 151 2019-07-24 Not Missing 33.3 27.8 38.0  0.0     !NA     !NA     !NA     !NA
## 152 2019-07-24 Not Missing 33.3 27.8 38.0  0.0     !NA     !NA     !NA     !NA
## 153 2019-07-25 Not Missing 28.7 25.0 37.8 21.1     !NA     !NA     !NA     !NA
## 154 2019-07-25 Not Missing 28.7 25.0 37.8 21.1     !NA     !NA     !NA     !NA
## 155 2019-07-26 Not Missing 28.6 26.2 32.0  0.5     !NA     !NA     !NA     !NA
## 156 2019-07-26 Not Missing 28.6 26.2 32.0  0.5     !NA     !NA     !NA     !NA
## 157 2019-07-27 Not Missing 28.7 25.4 32.5 10.9     !NA     !NA     !NA     !NA
## 158 2019-07-27 Not Missing 28.7 25.4 32.5 10.9     !NA     !NA     !NA     !NA
## 159 2019-07-28 Not Missing 30.1 25.8 34.0  3.0     !NA     !NA     !NA     !NA
## 160 2019-07-28 Not Missing 30.1 25.8 34.0  3.0     !NA     !NA     !NA     !NA
## 161 2019-07-29 Not Missing 31.9 27.9 36.0  6.1     !NA     !NA     !NA     !NA
## 162 2019-07-29 Not Missing 31.9 27.9 36.0  6.1     !NA     !NA     !NA     !NA
## 163 2019-07-30 Not Missing 31.5 27.4 36.1  0.0     !NA     !NA     !NA     !NA
## 164 2019-07-30 Not Missing 31.5 27.4 36.1  0.0     !NA     !NA     !NA     !NA
## 165 2019-07-31 Not Missing 31.9 27.8 35.1  0.0     !NA     !NA     !NA     !NA
## 166 2019-07-31 Not Missing 31.9 27.8 35.1  0.0     !NA     !NA     !NA     !NA
## 167 2019-08-06 Not Missing 27.2 24.0 37.4 11.9     !NA     !NA     !NA     !NA
## 168 2019-08-06 Not Missing 27.2 24.0 37.4 11.9     !NA     !NA     !NA     !NA
## 169 2019-08-07 Not Missing 30.9 25.6 34.0 22.1     !NA     !NA     !NA     !NA
## 170 2019-08-07 Not Missing 30.9 25.6 34.0 22.1     !NA     !NA     !NA     !NA
## 171 2019-08-08 Not Missing 33.5 27.5 38.0  0.0     !NA     !NA     !NA     !NA
## 172 2019-08-08 Not Missing 33.5 27.5 38.0  0.0     !NA     !NA     !NA     !NA
## 173 2019-08-10 Not Missing 31.1 27.2 34.0  0.0     !NA     !NA     !NA     !NA
## 174 2019-08-10 Not Missing 31.1 27.2 34.0  0.0     !NA     !NA     !NA     !NA
## 175 2019-08-11 Not Missing 32.9 27.6 36.0  0.0     !NA     !NA     !NA     !NA
## 176 2019-08-11 Not Missing 32.9 27.6 36.0  0.0     !NA     !NA     !NA     !NA
## 177 2019-08-12 Not Missing 31.1 28.0 36.2  0.3     !NA     !NA     !NA     !NA
## 178 2019-08-12 Not Missing 31.1 28.0 36.2  0.3     !NA     !NA     !NA     !NA
## 179 2019-08-13 Not Missing 30.4 27.8 35.0  2.0     !NA     !NA     !NA     !NA
## 180 2019-08-13 Not Missing 30.4 27.8 35.0  2.0     !NA     !NA     !NA     !NA
## 181 2019-08-14 Not Missing 30.5 25.6 35.5 10.9     !NA     !NA     !NA     !NA
## 182 2019-08-14 Not Missing 30.5 25.6 35.5 10.9     !NA     !NA     !NA     !NA
## 183 2019-08-15 Not Missing 30.6 24.8 34.6 10.9     !NA     !NA     !NA     !NA
## 184 2019-08-15 Not Missing 30.6 24.8 34.6 10.9     !NA     !NA     !NA     !NA
## 185 2019-08-16 Not Missing 29.9 26.6 34.8  0.0     !NA     !NA     !NA     !NA
## 186 2019-08-16 Not Missing 29.9 26.6 34.8  0.0     !NA     !NA     !NA     !NA
## 187 2019-08-17 Not Missing 26.9 25.2 32.5  7.1     !NA     !NA     !NA     !NA
## 188 2019-08-17 Not Missing 26.9 25.2 32.5  7.1     !NA     !NA     !NA     !NA
## 189 2019-08-18 Not Missing 27.3 24.8 29.6 46.0     !NA     !NA     !NA     !NA
## 190 2019-08-18 Not Missing 27.3 24.8 29.6 46.0     !NA     !NA     !NA     !NA
## 191 2019-08-19 Not Missing 30.7 23.6 34.0  2.0     !NA     !NA     !NA     !NA
## 192 2019-08-19 Not Missing 30.7 23.6 34.0  2.0     !NA     !NA     !NA     !NA
## 193 2019-08-20 Not Missing 31.4 25.6 35.0  0.0     !NA     !NA     !NA     !NA
## 194 2019-08-20 Not Missing 31.4 25.6 35.0  0.0     !NA     !NA     !NA     !NA
## 195 2019-08-21 Not Missing 32.0 24.8 36.0  0.0     !NA     !NA     !NA     !NA
## 196 2019-08-21 Not Missing 32.0 24.8 36.0  0.0     !NA     !NA     !NA     !NA
## 197 2019-08-23 Not Missing 31.8 24.6 36.0  0.0     !NA     !NA     !NA     !NA
## 198 2019-08-23 Not Missing 31.8 24.6 36.0  0.0     !NA     !NA     !NA     !NA
## 199 2019-08-25 Not Missing 29.3 26.8 33.8  0.3     !NA     !NA     !NA     !NA
## 200 2019-08-25 Not Missing 29.3 26.8 33.8  0.3     !NA     !NA     !NA     !NA
## 201 2019-08-26 Not Missing 31.9 25.7 35.0  3.0     !NA     !NA     !NA     !NA
## 202 2019-08-26 Not Missing 31.9 25.7 35.0  3.0     !NA     !NA     !NA     !NA
## 203 2019-08-27 Not Missing 32.9 27.4 36.4  0.0     !NA     !NA     !NA     !NA
## 204 2019-08-27 Not Missing 32.9 27.4 36.4  0.0     !NA     !NA     !NA     !NA
## 205 2019-09-15 Not Missing 30.7 27.3 35.2  2.0     !NA     !NA     !NA     !NA
## 206 2019-09-15 Not Missing 30.7 27.3 35.2  2.0     !NA     !NA     !NA     !NA
## 207 2019-09-16 Not Missing 32.3 27.1 36.0  0.0     !NA     !NA     !NA     !NA
## 208 2019-09-16 Not Missing 32.3 27.1 36.0  0.0     !NA     !NA     !NA     !NA
## 209 2019-09-17 Not Missing 32.3 27.5 36.4  5.1     !NA     !NA     !NA     !NA
## 210 2019-09-17 Not Missing 32.3 27.5 36.4  5.1     !NA     !NA     !NA     !NA
## 211 2019-09-18 Not Missing 30.9 26.2 36.8  0.3     !NA     !NA     !NA     !NA
## 212 2019-09-18 Not Missing 30.9 26.2 36.8  0.3     !NA     !NA     !NA     !NA
## 213 2019-09-19 Not Missing 29.1 25.1 35.1  0.0     !NA     !NA     !NA     !NA
## 214 2019-09-19 Not Missing 29.1 25.1 35.1  0.0     !NA     !NA     !NA     !NA
## 215 2019-09-20 Not Missing 30.8 24.9 35.0  0.3     !NA     !NA     !NA     !NA
## 216 2019-09-20 Not Missing 30.8 24.9 35.0  0.3     !NA     !NA     !NA     !NA
## 217 2019-09-21 Not Missing 28.0 24.0 35.3  0.0     !NA     !NA     !NA     !NA
## 218 2019-09-21 Not Missing 28.0 24.0 35.3  0.0     !NA     !NA     !NA     !NA
## 219 2019-09-22 Not Missing 27.2 23.7 35.2 33.0     !NA     !NA     !NA     !NA
## 220 2019-09-22 Not Missing 27.2 23.7 35.2 33.0     !NA     !NA     !NA     !NA
## 221 2019-09-23 Not Missing 29.1 24.0 32.0  0.0     !NA     !NA     !NA     !NA
## 222 2019-09-23 Not Missing 29.1 24.0 32.0  0.0     !NA     !NA     !NA     !NA
## 223 2019-09-24 Not Missing 29.4 23.9 34.0  0.0     !NA     !NA     !NA     !NA
## 224 2019-09-24 Not Missing 29.4 23.9 34.0  0.0     !NA     !NA     !NA     !NA
## 225 2019-09-29 Not Missing 27.7 24.2 33.0  0.3     !NA     !NA     !NA     !NA
## 226 2019-09-29 Not Missing 27.7 24.2 33.0  0.3     !NA     !NA     !NA     !NA
## 227 2019-09-30 Not Missing 27.4 23.8 31.2  0.0     !NA     !NA     !NA     !NA
## 228 2019-09-30 Not Missing 27.4 23.8 31.2  0.0     !NA     !NA     !NA     !NA
## 229 2019-10-01 Not Missing 28.7 23.8 32.4  0.0     !NA     !NA     !NA     !NA
## 230 2019-10-01 Not Missing 28.7 23.8 32.4  0.0     !NA     !NA     !NA     !NA
## 231 2019-10-02 Not Missing 29.0 23.4 33.0  2.0     !NA     !NA     !NA     !NA
## 232 2019-10-03 Not Missing 28.8 21.4 34.4  0.0     !NA     !NA     !NA     !NA
## 233 2019-10-04 Not Missing 27.2 19.8 34.8 47.0     !NA     !NA     !NA     !NA
## 234 2019-10-04 Not Missing 27.2 19.8 34.8 47.0     !NA     !NA     !NA     !NA
## 235 2019-10-05 Not Missing 27.3 21.0 32.0  0.0     !NA     !NA     !NA     !NA
## 236 2019-10-05 Not Missing 27.3 21.0 32.0  0.0     !NA     !NA     !NA     !NA
## 237 2019-10-19 Not Missing 27.3 19.8 34.7  0.0     !NA     !NA     !NA     !NA
## 238 2019-10-19 Not Missing 27.3 19.8 34.7  0.0     !NA     !NA     !NA     !NA
## 239 2019-11-03 Not Missing 23.4 18.7 29.5  0.3     !NA     !NA     !NA     !NA
## 240 2019-11-03 Not Missing 23.4 18.7 29.5  0.3     !NA     !NA     !NA     !NA
## 241 2019-11-04 Not Missing 24.1 18.9 29.4  0.0     !NA     !NA     !NA     !NA
## 242 2019-11-04 Not Missing 24.1 18.9 29.4  0.0     !NA     !NA     !NA     !NA
## 243 2019-11-26 Not Missing 21.6 16.2 27.2  0.0     !NA     !NA     !NA     !NA
## 244 2019-11-26 Not Missing 21.6 16.2 27.2  0.0     !NA     !NA     !NA     !NA
## 245 2019-11-28 Not Missing 18.3 16.2 27.4  0.0     !NA     !NA     !NA     !NA
## 246 2019-11-28 Not Missing 18.3 16.2 27.4  0.0     !NA     !NA     !NA     !NA
## 247 2019-11-29 Not Missing 18.1 12.5 24.6  0.0     !NA     !NA     !NA     !NA
## 248 2019-11-29 Not Missing 18.1 12.5 24.6  0.0     !NA     !NA     !NA     !NA
## 249 2019-12-13 Not Missing 15.2 12.8 21.5 34.0     !NA     !NA     !NA     !NA
## 250 2019-12-13 Not Missing 15.2 12.8 21.5 34.0     !NA     !NA     !NA     !NA
## 251 2019-12-14 Not Missing 14.7  9.7 18.7  0.5     !NA     !NA     !NA     !NA
## 252 2019-12-14 Not Missing 14.7  9.7 18.7  0.5     !NA     !NA     !NA     !NA
## 253 2019-12-15 Not Missing 14.2  9.8 19.2  0.0     !NA     !NA     !NA     !NA
## 254 2019-12-15 Not Missing 14.2  9.8 19.2  0.0     !NA     !NA     !NA     !NA
## 255 2020-01-07 Not Missing 15.0 11.6 19.1  0.0     !NA     !NA     !NA     !NA
## 256 2020-01-08 Not Missing 13.2 11.9 19.0  6.1     !NA     !NA     !NA     !NA
## 257 2020-01-08 Not Missing 13.2 11.9 19.0  6.1     !NA     !NA     !NA     !NA
## 258 2020-01-09 Not Missing 10.4  8.4 14.6 10.9     !NA     !NA     !NA     !NA
## 259 2020-01-09 Not Missing 10.4  8.4 14.6 10.9     !NA     !NA     !NA     !NA
## 260 2020-01-10 Not Missing 10.7  5.3 17.4  0.0     !NA     !NA     !NA     !NA
## 261 2020-01-10 Not Missing 10.7  5.3 17.4  0.0     !NA     !NA     !NA     !NA
## 262 2020-01-16 Not Missing 13.2  9.5 19.9  0.0     !NA     !NA     !NA     !NA
## 263 2020-01-16 Not Missing 13.2  9.5 19.9  0.0     !NA     !NA     !NA     !NA
## 264 2020-01-17 Not Missing 13.2 11.2 17.4  6.1     !NA     !NA     !NA     !NA
## 265 2020-01-17 Not Missing 13.2 11.2 17.4  6.1     !NA     !NA     !NA     !NA
## 266 2020-01-18 Not Missing 12.3  8.4 17.7  0.0     !NA     !NA     !NA     !NA
## 267 2020-01-18 Not Missing 12.3  8.4 17.7  0.0     !NA     !NA     !NA     !NA
##     prcp_NA StationId  PM2.5   PM10    NO    NO2   NOx   NH3   CO   SO2    O3
## 1       !NA     DL019  48.03  89.10  4.09  39.86 24.30 18.91 0.68 12.71  9.14
## 2       !NA     DL019  23.98  38.46  3.64  34.88 21.51 26.11 0.52 11.41  6.42
## 3       !NA     DL019  34.77  60.62 11.30  53.24 37.53 38.76 0.75  9.87 11.59
## 4       !NA     DL019  42.65 113.91  5.90  50.46 31.40 21.05 0.91 12.43 10.90
## 5       !NA     DL019  44.09 138.82  2.30  34.02 19.90 27.99 0.56  7.27 13.37
## 6       !NA     DL019  48.80 110.93  8.11  29.32 22.19 35.32 0.53 10.14 15.38
## 7       !NA     DL019  59.80 114.52  1.66  32.95 18.81 28.86 0.60 10.64 19.23
## 8       !NA     DL019  49.42 101.33  4.94  38.88 24.63 22.11 0.61 11.04 18.74
## 9       !NA     DL019  40.46  94.12  9.68  32.98 25.43 23.14 0.50 10.86 10.50
## 10      !NA     DL019  33.39  85.74  5.23  32.80 21.71 23.95 0.51 10.98  9.65
## 11      !NA     DL019  35.95  90.71 17.23  46.91 39.00 30.17 0.70 11.83  8.67
## 12      !NA     DL019  41.92  90.96 20.37  74.41 56.11 49.97 0.64 10.83 12.26
## 13      !NA     DL019  40.77 108.08 21.07  68.90 53.83 56.35 0.52 10.22 17.89
## 14      !NA     DL019  36.03  99.81 13.89  64.51 45.63 49.08 0.48 13.14 11.81
## 15      !NA     DL019  39.06  88.27  6.45  73.05 44.00 30.64 0.72 13.22  9.06
## 16      !NA     DL019  47.97  97.82 17.46  64.82 47.08 43.67 0.63 17.02  9.26
## 17      !NA     DL019  22.64  59.55  3.46  33.60 20.66 21.08 0.87 17.24 10.32
## 18      !NA     DL019  27.32  79.45  6.11  31.43 21.70 21.78 0.69 12.70 10.95
## 19      !NA     DL019  40.11 101.69  7.81  29.34 21.88 33.71 0.78 12.22 11.37
## 20      !NA     DL019  27.56  76.45  8.17  27.78 21.44 44.72 0.70 11.84  8.60
## 21      !NA     DL019  18.45  58.76  4.48  30.09 19.66 43.76 0.70 11.46  8.75
## 22      !NA     DL019  32.40  88.77  4.75  26.28 17.83 42.84 0.68 12.00 10.14
## 23      !NA     DL019  41.42 130.54  5.49  24.50 17.49 41.32 0.66 11.67 10.44
## 24      !NA     DL019  37.15 129.03  4.97  23.76 16.65 39.42 0.63 11.74 11.72
## 25      !NA     DL019  70.14 177.67  7.05  36.56 25.19 40.30 0.67 12.47 10.18
## 26      !NA     DL019  58.59 152.86  9.93  31.35 24.77 41.15 0.70 11.59  9.32
## 27      !NA     DL019  47.34 128.71 14.22  34.28 29.81 41.15 0.69 11.42 16.40
## 28      !NA     DL019  36.09  96.95 13.76  32.23 28.36 44.02 0.70  9.82 10.43
## 29      !NA     DL019  32.29  98.68  9.28  32.25 24.72 44.54 0.53 11.67 18.31
## 30      !NA     DL019 154.65 249.46 16.98  61.42 46.51 41.21 1.80  4.86 15.92
## 31      !NA     DL019  93.58 145.83  3.81  45.51 27.30 40.69 1.64  5.53 30.92
## 32      !NA     DL019 134.31 177.53 54.66  67.76 80.62 47.55 2.80  2.89  6.16
## 33      !NA     DL001 150.41 214.16  6.35  96.17 56.30 36.02 0.99 12.74 41.87
## 34      !NA     DL001 106.11 154.36  7.36  77.52 47.21 30.45 0.70 12.46 27.93
## 35      !NA     DL019  97.99 137.40 24.49  57.58 50.50 51.82 2.28  2.85 10.66
## 36      !NA     DL001 146.23 211.34  2.01  60.29 33.65 28.52 0.71 11.64 46.12
## 37      !NA     DL019 134.32 171.98  5.22  57.42 34.80 53.13 2.23  3.64 13.95
## 38      !NA     DL019 209.72 255.19 48.14  45.82 63.38  6.79 1.62  3.95 22.71
## 39      !NA     DL001 210.33 290.50  7.68  73.71 45.47 33.01 1.21 21.13 20.55
## 40      !NA     DL019  86.50 118.83 10.07  35.82 27.17  7.81 1.12  4.75  9.87
## 41      !NA     DL001  78.00 115.42  5.47  61.26 37.06 24.49 0.57 18.57 18.86
## 42      !NA     DL001  98.81 158.25  9.09  68.82 44.01 26.74 0.59 18.66 18.43
## 43      !NA     DL019 101.20 133.30  8.75  49.72 31.25 32.18 1.14  3.50 18.23
## 44      !NA     DL001 150.23 196.81  2.24  91.81 50.66 44.10 0.87 22.62 23.96
## 45      !NA     DL019 168.74 187.08 28.77 112.54 83.32 54.21 0.74  5.32 21.11
## 46      !NA     DL001 191.15 321.77  6.67  95.19 56.02 44.42 1.19 26.78 27.61
## 47      !NA     DL019 147.88 207.89 67.89  73.26 94.31 36.03 0.79  7.33  6.20
## 48      !NA     DL001  39.77  46.11  2.14  52.05 29.42 27.64 0.45 21.19 30.98
## 49      !NA     DL019  55.65  85.27 10.90  62.96 42.36 34.35 0.90  6.16  7.20
## 50      !NA     DL001 104.21 162.48 39.24  56.00 61.83 26.57 0.98 25.83 20.15
## 51      !NA     DL019 146.05 195.46 59.46  52.54 76.42 37.90 1.60  8.21  6.30
## 52      !NA     DL001 102.38 160.79  8.33  50.07 33.41 28.12 0.71 21.38 25.92
## 53      !NA     DL019 119.33 176.41 33.24  54.55 56.11 32.59 1.19  8.01 14.91
## 54      !NA     DL001  76.20 113.33  2.76  53.11 30.51 27.75 0.50 31.74 28.56
## 55      !NA     DL019  72.98 123.75  7.71  57.39 36.81 29.03 0.93  7.45 13.34
## 56      !NA     DL019  89.44 139.14  4.02  56.49 33.31 26.86 0.83  7.12 13.92
## 57      !NA     DL001  88.94 135.61  2.73  45.81 26.61 26.01 0.59 23.34 32.51
## 58      !NA     DL001 191.46 252.49  2.51  85.07 47.28 49.65 1.04 27.00 28.53
## 59      !NA     DL019 166.52 217.06 15.66  69.95 49.96 37.94 1.05  6.26  4.75
## 60      !NA     DL001 172.49 244.02  2.52  61.92 34.99 43.08 0.79 21.83 23.07
## 61      !NA     DL019 185.62 236.47  4.23  49.53 29.80 39.63 1.15  6.56  4.53
## 62      !NA     DL001 213.33 311.04 17.63  85.95 60.09 45.73 1.63 27.71 24.50
## 63      !NA     DL019 188.02 283.68 56.11  58.54 76.89 40.28 1.09  8.03  7.73
## 64      !NA     DL001  70.52 101.02  1.82  54.99 30.69 30.62 0.59 23.19 28.01
## 65      !NA     DL019  65.24 132.07  4.34  37.01 23.22 30.98 0.81  6.75 11.12
## 66      !NA     DL019  63.20 155.56 18.80  37.41 35.22 22.11 0.92  7.22 13.03
## 67      !NA     DL001  65.52 126.50 25.78  44.60 44.75 21.79 0.62 26.44 23.73
## 68      !NA     DL001  77.25 140.38  4.53  43.58 26.88 23.22 0.53 26.43 30.43
## 69      !NA     DL019  68.77 159.78  6.96  35.30 24.45 21.50 0.87  8.52 13.42
## 70      !NA     DL019 159.43 244.48 38.12  60.53 63.27 31.33 1.19  8.34  5.36
## 71      !NA     DL001 174.76 246.32 13.33  81.38 54.15 36.65 1.39 28.40 31.81
## 72      !NA     DL001 120.54 157.00  2.75  55.38 31.69 28.87 0.87 31.67 29.82
## 73      !NA     DL019 148.94 231.57 10.78  50.74 35.77 31.36 0.99  8.43  6.65
## 74      !NA     DL019 117.91 193.99  3.10  35.12 21.21 27.14 0.98  7.52  6.97
## 75      !NA     DL001 106.16 155.68  2.63  46.76 27.03 25.11 0.66 28.29 24.06
## 76      !NA     DL001 125.07 192.40  7.38  64.35 40.32 30.37 0.91 24.55 43.45
## 77      !NA     DL019 126.11 237.02 30.65  47.81 50.39 29.05 1.38  7.56  9.69
## 78      !NA     DL019 108.52 215.71 41.71  48.71 59.91 28.32 1.01  9.43 14.43
## 79      !NA     DL001  87.22 123.58  3.12  56.47 32.55 27.17 0.76 21.91 19.45
## 80      !NA     DL001  68.33 142.60  5.38  56.59 34.46 22.83 0.73 27.83 27.84
## 81      !NA     DL019  63.25 171.69 37.30  43.02 53.29 25.03 1.14  8.14  7.82
## 82      !NA     DL019  74.88 159.88 16.65  30.72 29.91 25.59 0.85  6.65  8.39
## 83      !NA     DL001  65.97 118.76  5.54  50.62 31.44 25.23 0.53 22.70 29.35
## 84      !NA     DL001  52.27  82.18  2.21  43.15 24.71 16.79 0.45 26.89 35.16
## 85      !NA     DL019  56.93 133.00 16.89  57.14 44.14 21.03 1.00  8.73  8.54
## 86      !NA     DL001  42.78  68.55  3.01  36.94 22.10 14.80 0.40 22.35 21.55
## 87      !NA     DL019  91.84 176.40 43.90  63.85 69.74 18.87 1.11  9.27  4.59
## 88      !NA     DL001 138.18 223.32 20.19  71.38 54.39 24.18 0.99 29.02  9.77
## 89      !NA     DL001  61.74  81.40  2.16  40.92 23.53 19.72 0.54 25.56 38.38
## 90      !NA     DL019  61.68 115.89 16.01  44.02 36.45 19.22 0.94  6.95  7.99
## 91      !NA     DL001  76.20 114.85 10.49  47.07 33.59 20.26 0.65 26.95 28.00
## 92      !NA     DL019 105.91 188.61 51.91  31.14 58.84 21.19 1.05  7.07 32.26
## 93      !NA     DL001  71.82 135.15  5.63  43.70 27.84 19.20 0.51 26.19 33.92
## 94      !NA     DL019  63.73 132.50  5.30  34.74 22.69 17.55 0.89  6.88 44.88
## 95      !NA     DL001  65.60 149.44 11.06  50.46 35.84 20.33 0.51 32.84 38.21
## 96      !NA     DL019  65.38 163.77  9.81  42.77 30.70 15.86 0.86  9.43  9.38
## 97      !NA     DL019  72.04 159.52  2.92  36.21 21.12 20.37 0.83  9.15 23.64
## 98      !NA     DL001  59.90 119.92 21.80  56.28 47.72 21.08 0.51 25.61 36.05
## 99      !NA     DL001  60.85 114.67  1.85  52.28 29.31 20.44 0.44 27.45 50.86
## 100     !NA     DL019  63.29 149.71  5.00  41.93 26.36 18.62 0.73  8.34 18.36
## 101     !NA     DL019  48.68 196.27  4.82  41.01 25.74 20.79 0.63 10.52 22.62
## 102     !NA     DL019  52.56 184.04  4.54  42.27 25.74 19.63 0.85 10.67 25.11
## 103     !NA     DL019  79.36 293.61 24.63  54.42 49.03 24.12 1.05 10.76 20.50
## 104     !NA     DL019 143.06 348.88 36.67  52.61 57.76 29.21 1.12 10.10 23.44
## 105     !NA     DL019  48.56 230.41  8.45  45.13 30.50 22.73 0.90  9.62 20.94
## 106     !NA     DL019  40.45 169.65  5.22  40.98 26.04 23.85 0.72 10.07 19.98
## 107     !NA     DL019  42.68 183.83  7.75  48.28 31.99 21.72 0.84 11.92 18.62
## 108     !NA     DL019  66.44 168.85  9.90  43.25 31.07 22.89 0.84  7.58 18.10
## 109     !NA     DL019  27.12  76.50  4.31  36.48 22.90 20.31 0.67  7.68 20.07
## 110     !NA     DL019  38.28 111.35  5.60  41.57 26.29 20.85 0.77  9.11 23.55
## 111     !NA     DL019  63.06 144.54 16.82  50.42 40.53 20.87 0.88 11.75 21.30
## 112     !NA     DL019  88.20 195.73 10.97  54.53 36.33 30.18 0.81 10.69 21.82
## 113     !NA     DL019 103.65 234.47 42.04  68.22 67.57 35.99 0.74 12.78 17.04
## 114     !NA     DL019 101.77 293.29 10.79  42.76 31.16 20.02 0.87 11.65 24.60
## 115     !NA     DL019  57.80 131.08  9.73  43.60 28.85 17.94 0.83  8.31 24.87
## 116     !NA     DL019  85.39 208.42 53.08  70.94 80.64 27.12 1.15  8.95 18.56
## 117     !NA     DL019  76.67 220.00 39.28  64.30 62.63 30.79 0.95  8.41 20.23
## 118     !NA     DL019  78.33 344.09 37.56  56.10 55.83 23.64 0.87  7.59 20.45
## 119     !NA     DL019  43.61 123.89  7.87  29.20 18.74 28.60 0.74  7.04 24.10
## 120     !NA     DL019  83.21 209.58  8.41  38.00 26.96 25.14 0.83  8.14 28.73
## 121     !NA     DL019  44.15 127.02 18.11  31.04 30.90 40.73 0.79  9.83 19.49
## 122     !NA     DL019  35.08 121.60  5.12  32.40 21.44 26.51 0.85 12.37 26.73
## 123     !NA     DL019  68.79 172.15  4.91  25.83 17.74 46.24 0.70  7.92 24.92
## 124     !NA     DL019  64.84 153.50  5.63  30.68 20.88 42.71 0.79  7.95 22.25
## 125     !NA     DL019  34.55  94.30 16.34  44.70 35.00 28.79 0.86  6.04 19.00
## 126     !NA     DL019  93.17 241.93 20.11  51.61 43.85 27.04 0.61  8.18 26.65
## 127     !NA     DL019  63.18 186.10  9.69  73.96 46.90 21.34 0.77  8.09 32.03
## 128     !NA     DL019  45.21 118.69 11.17  35.18 27.56 14.99 0.87  7.36 17.54
## 129     !NA     DL001  47.45  75.03  2.85  49.05 28.41 57.51 0.66 23.86 24.61
## 130     !NA     DL019  64.39 135.59 21.94  39.47 38.85 16.95 0.81  7.58 15.18
## 131     !NA     DL001  42.46  74.21  3.21  51.40 29.99 54.50 0.61 21.37 39.84
## 132     !NA     DL019  20.55 103.28  8.98  35.52 26.22 17.80 0.81  6.94 18.97
## 133     !NA     DL001  43.80  70.18  1.11  32.24 18.05 55.02 0.53 23.36 45.17
## 134     !NA     DL019  34.34 107.30  6.22  32.23 22.20 15.30 0.64  7.03 21.85
## 135     !NA     DL001  54.62 140.25  2.16  42.45 24.34 36.26 0.64 18.98 44.54
## 136     !NA     DL019  45.93 169.15  7.54  40.61 27.74 15.03 0.80  7.73 20.40
## 137     !NA     DL001  24.15  36.65  1.12  23.82 13.61 17.10 0.88 10.79 37.05
## 138     !NA     DL019  38.59  92.92  6.26  39.53 25.99 20.62 0.91  4.35 16.46
## 139     !NA     DL001  32.23  57.92  1.42  34.34 19.42 19.36 0.76 11.85 42.35
## 140     !NA     DL019  33.97  80.83 15.40  49.26 36.63 21.24 1.06  4.28 16.92
## 141     !NA     DL001  71.08 138.35  2.90  42.43 24.95 23.11 1.04 20.86 74.89
## 142     !NA     DL019  63.72 146.20 54.57  48.09 67.12 21.17 1.12  3.95 20.31
## 143     !NA     DL001  43.86  84.22  1.66  47.05 26.47 28.09 0.67 11.64 22.63
## 144     !NA     DL019  58.94 131.49 11.36  57.36 39.70 22.36 1.00  3.49 14.91
## 145     !NA     DL001  55.20  85.48  2.14  43.25 24.72 17.93 2.15 11.15 19.66
## 146     !NA     DL019  58.51 111.60 22.58  46.04 42.73 27.93 0.85  3.76 23.57
## 147     !NA     DL019  69.38 122.27 52.13  43.53 65.52 34.12 1.12  3.71 17.17
## 148     !NA     DL001  53.82  80.07  1.96  46.47 26.17 29.33 1.49 10.42 24.31
## 149     !NA     DL019  89.95 160.75 46.38  57.51 68.24 31.60 1.44  3.93 28.22
## 150     !NA     DL001  49.88  77.04  1.49  67.93 37.33 34.69 2.15  9.93 21.56
## 151     !NA     DL001  65.75 152.15  2.70  68.72 38.75 44.05 0.98 11.27 30.19
## 152     !NA     DL019  79.58 145.19 33.93  54.49 56.51 28.33 0.87  3.94 17.12
## 153     !NA     DL001  19.18  44.81  1.72  55.82 31.07 63.55 0.73 10.20 17.24
## 154     !NA     DL019  30.30  73.56 11.44  40.74 30.17 22.51 0.89  3.91 15.14
## 155     !NA     DL001  34.31  43.86  1.33  41.42 23.12 32.85 0.61 12.76 26.60
## 156     !NA     DL019  40.11  85.79 13.89  47.02 36.33 26.32 0.88  3.28 13.14
## 157     !NA     DL019  33.53  79.10 13.98  40.59 32.16 25.13 0.88  3.75 14.18
## 158     !NA     DL001  24.82  20.09  1.21  26.91 15.30 21.83 0.62 10.06 24.13
## 159     !NA     DL001  21.00  23.73  1.25  27.88 15.86 22.64 0.57 11.31 21.73
## 160     !NA     DL019  21.00  69.27  6.18  30.82 21.16 20.55 0.64  3.80 11.30
## 161     !NA     DL019  19.85  73.21  5.14  32.82 21.45 17.52 0.67  3.64 12.95
## 162     !NA     DL001  20.23  30.11  1.21  36.57 20.70 29.44 0.59 12.79 24.47
## 163     !NA     DL001  24.44  36.65  1.31  45.32 25.17 34.19 0.69 11.29 22.74
## 164     !NA     DL019  27.57  89.08  6.04  37.27 24.49 18.86 0.77  3.60 12.81
## 165     !NA     DL019  26.96  88.10  4.67  36.28 23.01 19.32 0.91  4.02 13.46
## 166     !NA     DL001  26.58  43.81  1.13  39.98 22.19 30.86 0.56 11.51 27.06
## 167     !NA     DL001  50.96  77.29  4.98  70.76 41.71 46.58 0.94 10.66  6.76
## 168     !NA     DL019  26.39  73.99 23.17  49.17 44.14 30.47 0.93  4.06  9.84
## 169     !NA     DL001  24.22  29.95  3.06  48.51 28.30 32.54 0.55 12.30 18.95
## 170     !NA     DL019  30.69  92.73 25.54  39.67 40.61 21.75 1.10  5.11 15.49
## 171     !NA     DL019  18.32  86.28 11.46  42.90 32.05 16.76 0.94  6.31 19.06
## 172     !NA     DL001  22.54  34.75  1.51  32.67 18.61 24.72 0.59 12.84 29.70
## 173     !NA     DL001  18.38  27.38  1.25  24.00 13.77 20.41 1.11 12.44 34.80
## 174     !NA     DL019  14.42  68.50  4.87  30.48 20.15 16.87 0.68  3.54 11.09
## 175     !NA     DL001  31.00  54.17  1.33  39.64 23.15 29.22 2.07 13.10 35.91
## 176     !NA     DL019  26.24  80.58  4.96  33.22 21.65 16.69 0.74  4.21 13.16
## 177     !NA     DL019  17.52  72.61  4.53  26.55 17.57 16.81 0.79  3.68 12.75
## 178     !NA     DL001  23.12  51.64  1.75  47.33 26.60 29.90 1.17 12.01 22.06
## 179     !NA     DL019  20.46  83.27 14.22  28.45 26.71 16.21 0.85  3.57 11.34
## 180     !NA     DL001  22.85  42.51  3.28  47.21 27.77 30.39 1.17 10.98 16.34
## 181     !NA     DL019  42.35  93.96  9.75  32.56 25.20 18.26 0.83  3.21 15.71
## 182     !NA     DL001  24.55  30.97  1.67  38.25 21.71 27.03 0.98 11.73 30.61
## 183     !NA     DL019  31.03  84.30 11.87  24.54 22.73 17.06 0.74  3.19 11.78
## 184     !NA     DL001  20.74  34.86  1.94  35.06 20.23 27.33 0.74 10.46 23.76
## 185     !NA     DL001  12.55  18.06  1.01  18.55 10.69 17.01 0.61 10.65 30.45
## 186     !NA     DL019  16.71  68.16  6.19  26.68 19.19 14.44 0.74  3.55 10.76
## 187     !NA     DL019  14.86  49.05  8.84  20.54 18.09 13.03 0.68  3.71  9.75
## 188     !NA     DL001  10.06  10.75  1.35  17.53 10.43 14.90 0.66 10.06 16.16
## 189     !NA     DL019  19.09  53.43  8.26  18.77 16.69 13.16 0.74  3.91 11.67
## 190     !NA     DL001  12.17  11.54  1.31  17.46 10.37 14.89 0.76 10.43 13.92
## 191     !NA     DL019  39.69 101.10 24.08  24.40 32.49 13.19 0.99  3.17 14.82
## 192     !NA     DL001  35.17  52.60  3.79  26.92 17.40 21.70 0.81 10.50 21.74
## 193     !NA     DL019  41.58 103.01 14.88  33.98 29.26 17.01 0.87  2.73 12.85
## 194     !NA     DL001  34.21  65.97  4.93  34.44 22.34 28.25 0.77 11.47 23.82
## 195     !NA     DL019  47.48 133.83  8.92  42.71 29.99 23.84 0.76  2.35 15.48
## 196     !NA     DL001  45.26 106.77  3.22  37.21 22.41 26.20 0.79 10.62 15.46
## 197     !NA     DL019  50.69 124.96 49.95  32.06 57.44 23.93 0.94  3.07 12.16
## 198     !NA     DL001  54.67 126.70 23.44  65.71 54.08 33.30 1.15 11.17 18.10
## 199     !NA     DL001  52.11  79.86  1.87  48.46 27.36 26.43 0.92 12.09 19.87
## 200     !NA     DL019  31.08  79.48 17.10  26.83 28.21 21.81 0.85  4.12  9.60
## 201     !NA     DL019  23.94  72.95 23.47  35.11 37.81 21.00 0.87  3.38 11.77
## 202     !NA     DL001  15.35  30.65  1.39  30.86 17.55 19.04 0.62 10.87 19.69
## 203     !NA     DL001  29.75  52.36  2.50  26.98 16.16 18.98 0.71 11.11 18.22
## 204     !NA     DL019  35.19  98.52 23.78  39.56 40.32 20.89 0.95  3.67 15.01
## 205     !NA     DL019  39.41  91.71 10.58  31.13 25.11 14.49 0.88  3.95 17.75
## 206     !NA     DL001  28.75  47.73  1.30  25.54 14.64 18.41 0.61  9.42 19.78
## 207     !NA     DL019  53.96 121.30 23.45  29.47 34.59 15.10 1.02  4.77 21.23
## 208     !NA     DL001  49.98 100.67  6.72  36.49 24.88 27.82 0.83 10.02 36.94
## 209     !NA     DL019  57.39 129.73 11.07  35.22 27.72 13.47 1.00  4.35 19.99
## 210     !NA     DL001  52.58 104.31  1.97  34.85 20.14 26.68 0.74 10.25 44.14
## 211     !NA     DL001  32.61  71.81  1.23  30.62 17.29 21.07 0.56 13.55 41.14
## 212     !NA     DL019  37.29 107.50  4.22  36.34 22.70 11.27 0.73  4.92 16.54
## 213     !NA     DL019  20.60  76.90  3.95  27.00 17.56  9.84 0.75  4.30 15.21
## 214     !NA     DL001  17.17  40.71  1.03  20.78 11.90 16.81 0.50 13.76 30.34
## 215     !NA     DL019  30.98  88.04  8.97  27.10 21.71  9.32 0.78  4.61 14.38
## 216     !NA     DL001  38.12  78.35  1.71  30.01 17.35 21.57 0.62 16.35 39.36
## 217     !NA     DL001  60.40 103.46  5.39  37.89 24.58 27.24 0.87 14.04 33.37
## 218     !NA     DL019  33.33  90.01 11.16  21.81 20.71  8.78 0.80  5.08 16.26
## 219     !NA     DL019  22.10  73.99 14.84  22.41 24.01  7.35 0.73  4.64 14.03
## 220     !NA     DL001  31.81  52.50  4.99  47.46 29.53 30.15 0.56 10.32 20.23
## 221     !NA     DL001  26.15  46.15  1.26  36.90 20.66 24.37 0.69 12.32 31.13
## 222     !NA     DL019  35.91 102.47  8.79  25.26 20.57  8.01 0.80  3.98 16.60
## 223     !NA     DL001  28.42  58.85  4.64  39.32 24.71 23.01 0.84  9.78 21.28
## 224     !NA     DL019  36.50 106.48 10.67  25.92 22.47  7.92 0.91  4.65 16.18
## 225     !NA     DL001  16.20  30.37  1.33  21.24 12.38 18.27 0.65 14.19 19.78
## 226     !NA     DL019  19.81  48.44  9.93  40.81 29.72 14.96 0.55  2.84 18.30
## 227     !NA     DL001  30.50  60.81  1.85  24.73 14.66 17.77 0.79 13.56 34.69
## 228     !NA     DL019  33.04  81.04 15.44  48.84 37.98 16.13 0.77  2.48 18.95
## 229     !NA     DL001  40.60  83.08  3.64  34.48 21.30 19.68 0.92 13.75 28.40
## 230     !NA     DL019  32.58  78.65 16.01  49.26 39.06 15.39 0.75  6.38 20.76
## 231     !NA     DL001  37.80  66.03  6.37  33.32 22.92 24.17 0.97 16.69 26.08
## 232     !NA     DL001  47.76  94.13  3.27  34.13 20.82 24.91 0.79 17.05 26.00
## 233     !NA     DL001  39.07  75.68  3.38  36.32 23.30 22.42 0.76 13.89 32.54
## 234     !NA     DL019  40.41  83.02 12.68  49.69 36.48 15.48 0.77  3.38 23.59
## 235     !NA     DL019  35.08  78.99  9.70  43.71 30.92 13.76 0.78  2.82 23.46
## 236     !NA     DL001  35.93  88.98  6.20  33.47 22.86 17.83 0.77 11.80 21.12
## 237     !NA     DL001  68.20 122.67  7.38  62.18 39.08 28.39 0.98 16.03 38.18
## 238     !NA     DL019  70.11 151.54 12.17  64.29 43.96 17.11 1.11  2.26  9.80
## 239     !NA     DL001 734.56 830.10 20.72  85.46 62.22 72.95 0.58 20.49  1.19
## 240     !NA     DL019 440.92 467.61 19.69  61.27 48.63 57.64 2.19  2.10  6.20
## 241     !NA     DL001 332.56 461.98  1.91  43.24 24.52 40.56 0.62 20.31 24.77
## 242     !NA     DL019 195.63 345.15 20.51  49.44 42.92 34.54 1.69  2.57  9.07
## 243     !NA     DL019 110.42 250.46 44.49  81.54 79.64 20.11 1.11  5.05  5.36
## 244     !NA     DL001  91.46 174.06  4.36  47.23 28.68 11.13 0.29 27.51 41.28
## 245     !NA     DL001  24.76  45.07  2.19  24.24 14.68 15.94 0.28 10.05 40.49
## 246     !NA     DL019  28.50  50.68  6.26  47.56 30.39 17.71 0.86  2.60  6.18
## 247     !NA     DL019  44.48  74.74 19.24  49.16 41.78 18.21 1.15  3.52  6.61
## 248     !NA     DL001  40.33  76.52  9.96  39.90 29.35 18.19 0.38  8.01 26.16
## 249     !NA     DL001  86.82 110.94  2.34  36.47 21.31 23.87 0.97 13.54 38.81
## 250     !NA     DL019  91.49 123.75  5.39  49.23 30.57 30.46 1.26  3.10  9.15
## 251     !NA     DL019  68.75 107.60 18.33  47.52 40.21 25.29 1.15  3.95  6.80
## 252     !NA     DL001  59.38  96.82 15.28  38.18 32.78 22.10 0.93 11.91 21.85
## 253     !NA     DL001  85.81 128.54  8.44  37.92 27.06 22.70 0.89 13.23 22.09
## 254     !NA     DL019  90.01 134.80 15.73  40.80 34.51 25.81 1.19  4.24  9.01
## 255     !NA     DL001  95.12 121.26  3.85  29.23 18.68 24.64 1.15  9.58 14.12
## 256     !NA     DL019 117.11 151.92 30.51  42.87 47.84 33.02 1.61  3.96  3.89
## 257     !NA     DL001  88.85 108.21  3.85  27.46 17.74 21.40 0.96  7.13 11.60
## 258     !NA     DL019  86.00 123.29  4.48  36.07 22.76 26.83 1.07  3.60  7.08
## 259     !NA     DL001  77.09 104.93  3.00  19.28 12.71 23.75 0.83  8.26 17.46
## 260     !NA     DL001 113.53 161.86  4.36  29.42 19.20 27.56 1.02 11.43 20.82
## 261     !NA     DL019 119.33 180.12 10.17  47.02 33.31 25.70 1.67  4.11  8.86
## 262     !NA     DL001 117.48 170.85  5.80  28.96 20.13 20.36 1.05  9.41  4.36
## 263     !NA     DL019 135.35 205.21 18.61  51.61 42.63 33.82 1.38  4.82  3.78
## 264     !NA     DL001  99.90 137.06  7.12  22.38 17.70 20.11 1.34  8.25  4.99
## 265     !NA     DL019  85.73 132.42 17.57  40.04 35.61 30.63 1.40  4.17  4.89
## 266     !NA     DL001 104.63 153.11  7.44  32.91 23.58 19.53 1.35  8.82  4.89
## 267     !NA     DL019 116.68 188.74 27.20  47.55 47.47 29.63 1.40  4.42  5.75
##     Benzene Toluene Xylene AQI   AQI_Bucket StationId_NA Date_NA PM2.5_NA
## 1      1.74   11.65   1.60  80 Satisfactory          !NA     !NA      !NA
## 2      2.10    8.95   1.23  55 Satisfactory          !NA     !NA      !NA
## 3     10.66   11.07   4.41  59 Satisfactory          !NA     !NA      !NA
## 4      8.03   15.18   4.50  92 Satisfactory          !NA     !NA      !NA
## 5      3.03   10.98   1.84 152     Moderate          !NA     !NA      !NA
## 6      3.08   16.27   2.48 104     Moderate          !NA     !NA      !NA
## 7      1.98   13.30   1.66 109     Moderate          !NA     !NA      !NA
## 8      2.71   11.25   2.98  91 Satisfactory          !NA     !NA      !NA
## 9      3.29   15.64   2.96  91 Satisfactory          !NA     !NA      !NA
## 10     1.75    9.87   1.48  86 Satisfactory          !NA     !NA      !NA
## 11     2.71   17.79   3.68  88 Satisfactory          !NA     !NA      !NA
## 12     2.50   14.86   3.02  96 Satisfactory          !NA     !NA      !NA
## 13     3.31   14.58   3.06 105     Moderate          !NA     !NA      !NA
## 14     4.00    9.55   1.60  97 Satisfactory          !NA     !NA      !NA
## 15     6.14   15.44   3.38  88 Satisfactory          !NA     !NA      !NA
## 16     5.32   13.99   3.91 100 Satisfactory          !NA     !NA      !NA
## 17     4.57    8.92   2.84  75 Satisfactory          !NA     !NA      !NA
## 18     2.51   10.43   2.28  76 Satisfactory          !NA     !NA      !NA
## 19     2.32   14.93   2.02  89 Satisfactory          !NA     !NA      !NA
## 20     1.48   12.24   2.59  93 Satisfactory          !NA     !NA      !NA
## 21     1.67   15.14   2.51  63 Satisfactory          !NA     !NA      !NA
## 22     1.50   11.78   2.13  73 Satisfactory          !NA     !NA      !NA
## 23     1.02    8.94   1.85 100 Satisfactory          !NA     !NA      !NA
## 24     0.76    8.28   1.21 123     Moderate          !NA     !NA      !NA
## 25     1.21   10.71   2.43 133     Moderate          !NA     !NA      !NA
## 26     1.61   10.94   1.84 164     Moderate          !NA     !NA      !NA
## 27     1.30   11.69   2.03 124     Moderate          !NA     !NA      !NA
## 28     1.65   13.64   2.80 110     Moderate          !NA     !NA      !NA
## 29     1.69   13.13   2.45  97 Satisfactory          !NA     !NA      !NA
## 30     2.74   13.28   3.92 343    Very Poor          !NA     !NA      !NA
## 31     1.75   10.45   1.26 221         Poor          !NA     !NA      !NA
## 32     4.75   22.63   8.32 357    Very Poor          !NA     !NA      !NA
## 33     5.64   28.30   0.51 372    Very Poor          !NA     !NA      !NA
## 34     4.20   33.48   0.59 280         Poor          !NA     !NA      !NA
## 35     4.91   22.90   6.81 244         Poor          !NA     !NA      !NA
## 36     2.61   29.13   0.12 319    Very Poor          !NA     !NA      !NA
## 37     4.06   14.67   3.07 308    Very Poor          !NA     !NA      !NA
## 38     4.15   24.65   6.71 402       Severe          !NA     !NA      !NA
## 39     8.40   63.70   3.90 403       Severe          !NA     !NA      !NA
## 40     2.63   17.28   3.28 242         Poor          !NA     !NA      !NA
## 41     4.86   40.12   2.04 249         Poor          !NA     !NA      !NA
## 42     1.10   18.50   0.00 199     Moderate          !NA     !NA      !NA
## 43     2.51   14.36   2.14 221         Poor          !NA     !NA      !NA
## 44     1.85    8.56   0.27 352    Very Poor          !NA     !NA      !NA
## 45     2.72   14.48   2.19 351    Very Poor          !NA     !NA      !NA
## 46     2.71    9.38   1.37 389    Very Poor          !NA     !NA      !NA
## 47     4.36   18.49   4.56 361    Very Poor          !NA     !NA      !NA
## 48     0.44    1.27   0.02 171     Moderate          !NA     !NA      !NA
## 49     1.76    7.62   0.55 163     Moderate          !NA     !NA      !NA
## 50     1.82   20.76   1.69 131     Moderate          !NA     !NA      !NA
## 51     5.92   28.73   2.15 201         Poor          !NA     !NA      !NA
## 52     1.53    6.43   0.47 292         Poor          !NA     !NA      !NA
## 53     4.18   22.69   2.07 329    Very Poor          !NA     !NA      !NA
## 54     0.58    7.17   0.09 148     Moderate          !NA     !NA      !NA
## 55     1.64   11.15   0.35 158     Moderate          !NA     !NA      !NA
## 56     1.76    8.57   0.11 200     Moderate          !NA     !NA      !NA
## 57     0.58    1.65   0.00 190     Moderate          !NA     !NA      !NA
## 58     0.94    9.92   0.03 339    Very Poor          !NA     !NA      !NA
## 59     3.03   10.49   0.58 321    Very Poor          !NA     !NA      !NA
## 60     1.13    7.45   0.01 346    Very Poor          !NA     !NA      !NA
## 61     3.58   13.83   0.69 344    Very Poor          !NA     !NA      !NA
## 62     3.70   20.42   4.85 380    Very Poor          !NA     !NA      !NA
## 63     3.72   15.44   3.03 366    Very Poor          !NA     !NA      !NA
## 64     0.74    3.54   0.14 267         Poor          !NA     !NA      !NA
## 65     0.86    2.85   0.15 249         Poor          !NA     !NA      !NA
## 66     2.51   12.12   0.64 122     Moderate          !NA     !NA      !NA
## 67     1.07   18.50   0.88 112     Moderate          !NA     !NA      !NA
## 68     0.49   21.40   0.09 133     Moderate          !NA     !NA      !NA
## 69     2.45    8.49   0.54 141     Moderate          !NA     !NA      !NA
## 70     2.72   10.82   1.51 338    Very Poor          !NA     !NA      !NA
## 71     1.80   10.75   1.74 366    Very Poor          !NA     !NA      !NA
## 72     0.84    4.52   0.02 297         Poor          !NA     !NA      !NA
## 73     2.74    9.93   0.98 310    Very Poor          !NA     !NA      !NA
## 74     1.97    7.96   0.37 319    Very Poor          !NA     !NA      !NA
## 75     0.50    8.24   0.00 291         Poor          !NA     !NA      !NA
## 76     0.71    3.79   0.09 245         Poor          !NA     !NA      !NA
## 77     2.91   13.07   1.39 215         Poor          !NA     !NA      !NA
## 78     3.69   19.43   2.45 322    Very Poor          !NA     !NA      !NA
## 79     0.40    1.12   0.00 281         Poor          !NA     !NA      !NA
## 80     0.31    1.51   0.00 119     Moderate          !NA     !NA      !NA
## 81     1.75   10.35   1.09 147     Moderate          !NA     !NA      !NA
## 82     1.73    7.28   1.01 161     Moderate          !NA     !NA      !NA
## 83     0.94    5.34   1.44 146     Moderate          !NA     !NA      !NA
## 84     0.21    0.72   0.00 119     Moderate          !NA     !NA      !NA
## 85     1.08    5.51   0.21 127     Moderate          !NA     !NA      !NA
## 86     0.17    1.01   0.00  77 Satisfactory          !NA     !NA      !NA
## 87     1.30    3.30   0.00 185     Moderate          !NA     !NA      !NA
## 88     1.06    6.11   0.21 250         Poor          !NA     !NA      !NA
## 89     0.68    2.66   0.20 212         Poor          !NA     !NA      !NA
## 90     1.32    3.52   0.09 149     Moderate          !NA     !NA      !NA
## 91     0.58    7.68   0.05 146     Moderate          !NA     !NA      !NA
## 92     2.57   10.15   0.90 212         Poor          !NA     !NA      !NA
## 93     0.51   27.04   0.13 135     Moderate          !NA     !NA      !NA
## 94     1.78    4.76   0.13 170     Moderate          !NA     !NA      !NA
## 95     0.68    3.98   0.04 153     Moderate          !NA     !NA      !NA
## 96     1.77    8.04   0.16 156     Moderate          !NA     !NA      !NA
## 97     2.41   13.08   0.40 181     Moderate          !NA     !NA      !NA
## 98     1.09    5.54   0.00 146     Moderate          !NA     !NA      !NA
## 99     0.54    9.44   0.00 102     Moderate          !NA     !NA      !NA
## 100    1.71    8.43   0.13 135     Moderate          !NA     !NA      !NA
## 101    1.55    5.94   0.14 179     Moderate          !NA     !NA      !NA
## 102    1.12    4.88   0.09 155     Moderate          !NA     !NA      !NA
## 103    3.87   18.15   1.36 224         Poor          !NA     !NA      !NA
## 104    5.04   17.64   1.95 291         Poor          !NA     !NA      !NA
## 105    1.91   11.61   0.68 194     Moderate          !NA     !NA      !NA
## 106    1.43    6.93   0.30 158     Moderate          !NA     !NA      !NA
## 107    1.13    9.78   0.25 146     Moderate          !NA     !NA      !NA
## 108    1.38    6.55   0.61 178     Moderate          !NA     !NA      !NA
## 109    0.30    1.84   0.01 104     Moderate          !NA     !NA      !NA
## 110    1.46    5.31   0.11  91 Satisfactory          !NA     !NA      !NA
## 111    2.07   17.25   0.70 127     Moderate          !NA     !NA      !NA
## 112    1.56    3.40   0.05 252         Poor          !NA     !NA      !NA
## 113    3.24   14.85   0.97 203         Poor          !NA     !NA      !NA
## 114    1.57    5.89   0.09 278         Poor          !NA     !NA      !NA
## 115    1.20    5.09   0.16 194     Moderate          !NA     !NA      !NA
## 116    5.45   18.65   1.65 174     Moderate          !NA     !NA      !NA
## 117    3.56   15.63   1.30 172     Moderate          !NA     !NA      !NA
## 118    2.67   13.55   1.74 243         Poor          !NA     !NA      !NA
## 119    1.28    4.89   0.06 183     Moderate          !NA     !NA      !NA
## 120    1.14    4.97   0.18 159     Moderate          !NA     !NA      !NA
## 121    3.31   12.55   3.59 139     Moderate          !NA     !NA      !NA
## 122    2.30    5.44   0.25 117     Moderate          !NA     !NA      !NA
## 123    0.09    0.45   0.00 221         Poor          !NA     !NA      !NA
## 124    0.28    1.13   0.02 134     Moderate          !NA     !NA      !NA
## 125    0.75    5.62   0.27 115     Moderate          !NA     !NA      !NA
## 126    1.16    7.35   0.53 159     Moderate          !NA     !NA      !NA
## 127    2.88    7.55   3.09 153     Moderate          !NA     !NA      !NA
## 128    1.25    4.33   0.31 113     Moderate          !NA     !NA      !NA
## 129    1.86   19.13   2.46  76 Satisfactory          !NA     !NA      !NA
## 130    1.61    9.64   0.57 123     Moderate          !NA     !NA      !NA
## 131    1.60   15.18   1.18  79 Satisfactory          !NA     !NA      !NA
## 132    0.52    2.24   0.00 108     Moderate          !NA     !NA      !NA
## 133    1.23   14.38   0.50  76 Satisfactory          !NA     !NA      !NA
## 134    0.75    3.90   0.04 100 Satisfactory          !NA     !NA      !NA
## 135    1.21   16.22   0.61 112     Moderate          !NA     !NA      !NA
## 136    0.76    3.66   0.11 130     Moderate          !NA     !NA      !NA
## 137    0.25   27.44   0.43  68 Satisfactory          !NA     !NA      !NA
## 138    1.03    6.48   0.10 103     Moderate          !NA     !NA      !NA
## 139    0.58   22.91   1.12  77 Satisfactory          !NA     !NA      !NA
## 140    0.57    5.23   0.12  88 Satisfactory          !NA     !NA      !NA
## 141    1.09   28.35   1.73 154     Moderate          !NA     !NA      !NA
## 142    1.46    8.45   0.88 108     Moderate          !NA     !NA      !NA
## 143    0.29   17.95   1.00 114     Moderate          !NA     !NA      !NA
## 144    1.55    7.70   0.60 132     Moderate          !NA     !NA      !NA
## 145    0.23   16.80   0.74 128     Moderate          !NA     !NA      !NA
## 146    0.88    7.71   0.24 123     Moderate          !NA     !NA      !NA
## 147    1.75    6.83   0.87 106     Moderate          !NA     !NA      !NA
## 148    0.19   11.27   0.45 101     Moderate          !NA     !NA      !NA
## 149    1.87   13.82   1.00 179     Moderate          !NA     !NA      !NA
## 150    0.03    7.95   0.25 128     Moderate          !NA     !NA      !NA
## 151    0.40   24.55   0.49 122     Moderate          !NA     !NA      !NA
## 152    1.28    7.79   0.99 194     Moderate          !NA     !NA      !NA
## 153    0.11   11.26   0.57  81 Satisfactory          !NA     !NA      !NA
## 154    0.35    2.89   0.15  98 Satisfactory          !NA     !NA      !NA
## 155    0.67   15.19   1.02  63 Satisfactory          !NA     !NA      !NA
## 156    0.90    5.47   0.28  92 Satisfactory          !NA     !NA      !NA
## 157    0.79    4.00   0.41  78 Satisfactory          !NA     !NA      !NA
## 158    0.55   10.18   0.60  59 Satisfactory          !NA     !NA      !NA
## 159    0.43    7.97   0.51  40         Good          !NA     !NA      !NA
## 160    0.13    1.31   0.01  81 Satisfactory          !NA     !NA      !NA
## 161    0.19    1.17   0.00  72 Satisfactory          !NA     !NA      !NA
## 162    0.52    7.64   0.30  42         Good          !NA     !NA      !NA
## 163    0.51    6.68   0.48  47         Good          !NA     !NA      !NA
## 164    0.67    2.92   0.11  79 Satisfactory          !NA     !NA      !NA
## 165    0.25    1.65   0.01  88 Satisfactory          !NA     !NA      !NA
## 166    0.48    6.22   0.27  45         Good          !NA     !NA      !NA
## 167    0.17   18.14   0.19  93 Satisfactory          !NA     !NA      !NA
## 168    1.74    6.99   2.46  95 Satisfactory          !NA     !NA      !NA
## 169    0.08    8.63   0.89  49         Good          !NA     !NA      !NA
## 170    1.51   11.10   0.91  84 Satisfactory          !NA     !NA      !NA
## 171    0.81    7.63   0.24  88 Satisfactory          !NA     !NA      !NA
## 172    0.19    7.54   0.44  57 Satisfactory          !NA     !NA      !NA
## 173    0.03    8.23   0.45 105     Moderate          !NA     !NA      !NA
## 174    0.30    0.97   0.02  76 Satisfactory          !NA     !NA      !NA
## 175    0.05   14.82   1.01 109     Moderate          !NA     !NA      !NA
## 176    0.23    1.76   0.00  76 Satisfactory          !NA     !NA      !NA
## 177    0.17    1.04   0.00  75 Satisfactory          !NA     !NA      !NA
## 178    0.02   15.16   0.98 101     Moderate          !NA     !NA      !NA
## 179    0.76    3.64   0.34  75 Satisfactory          !NA     !NA      !NA
## 180    0.06   13.62   1.39  92 Satisfactory          !NA     !NA      !NA
## 181    1.14    7.20   0.67  84 Satisfactory          !NA     !NA      !NA
## 182    0.19   19.90   1.27  93 Satisfactory          !NA     !NA      !NA
## 183    0.83    8.12   0.48  98 Satisfactory          !NA     !NA      !NA
## 184    0.44   16.33   1.01  51 Satisfactory          !NA     !NA      !NA
## 185    0.11    5.80   0.10  42         Good          !NA     !NA      !NA
## 186    0.24    1.57   0.06  72 Satisfactory          !NA     !NA      !NA
## 187    0.10    1.03   0.00  59 Satisfactory          !NA     !NA      !NA
## 188    0.21    8.30   0.54  43         Good          !NA     !NA      !NA
## 189    0.37    3.29   0.12  51 Satisfactory          !NA     !NA      !NA
## 190    0.25   15.44   0.61  52 Satisfactory          !NA     !NA      !NA
## 191    1.60   10.31   1.19  92 Satisfactory          !NA     !NA      !NA
## 192    0.73   16.48   1.80  64 Satisfactory          !NA     !NA      !NA
## 193    1.65    8.65   0.67 102     Moderate          !NA     !NA      !NA
## 194    2.31   11.46   3.52  61 Satisfactory          !NA     !NA      !NA
## 195    1.07    5.21   0.23 111     Moderate          !NA     !NA      !NA
## 196    6.91   11.97   9.88  87 Satisfactory          !NA     !NA      !NA
## 197    1.31    8.24   0.75 118     Moderate          !NA     !NA      !NA
## 198    1.41   13.92   2.54 117     Moderate          !NA     !NA      !NA
## 199    0.83    7.35   1.15  92 Satisfactory          !NA     !NA      !NA
## 200    0.97    7.48   0.43 102     Moderate          !NA     !NA      !NA
## 201    0.84    7.97   0.35  74 Satisfactory          !NA     !NA      !NA
## 202    0.38    5.38   0.69  51 Satisfactory          !NA     !NA      !NA
## 203    0.51    5.42   0.60  52 Satisfactory          !NA     !NA      !NA
## 204    1.14    7.84   0.59  87 Satisfactory          !NA     !NA      !NA
## 205    0.45    3.29   0.05  86 Satisfactory          !NA     !NA      !NA
## 206    0.20    9.26   0.62  46         Good          !NA     !NA      !NA
## 207    1.14    5.79   0.41 101     Moderate          !NA     !NA      !NA
## 208    0.16   19.63   2.29  80 Satisfactory          !NA     !NA      !NA
## 209    1.44    7.47   0.48 124     Moderate          !NA     !NA      !NA
## 210    0.55   18.39   1.69 106     Moderate          !NA     !NA      !NA
## 211    1.12   18.21   1.68  87 Satisfactory          !NA     !NA      !NA
## 212    0.44    2.70   0.08 111     Moderate          !NA     !NA      !NA
## 213    0.24    1.86   0.05  88 Satisfactory          !NA     !NA      !NA
## 214    0.59   15.64   0.71  56 Satisfactory          !NA     !NA      !NA
## 215    0.43    3.65   0.14  84 Satisfactory          !NA     !NA      !NA
## 216    1.57   19.25   1.10  91 Satisfactory          !NA     !NA      !NA
## 217    1.21   25.63   4.40 101     Moderate          !NA     !NA      !NA
## 218    0.65    3.40   0.29  91 Satisfactory          !NA     !NA      !NA
## 219    0.58    4.69   0.24  79 Satisfactory          !NA     !NA      !NA
## 220    2.40   22.63   4.60  80 Satisfactory          !NA     !NA      !NA
## 221    1.49   14.11   2.01  62 Satisfactory          !NA     !NA      !NA
## 222    0.85    4.85   0.29  86 Satisfactory          !NA     !NA      !NA
## 223    2.39   30.38   5.22  63 Satisfactory          !NA     !NA      !NA
## 224    0.82    5.98   0.52 104     Moderate          !NA     !NA      !NA
## 225    0.38    2.94   0.19  41         Good          !NA     !NA      !NA
## 226    0.26    2.43   0.04  60 Satisfactory          !NA     !NA      !NA
## 227    0.65    6.37   0.76  76 Satisfactory          !NA     !NA      !NA
## 228    0.74    3.99   0.27  60 Satisfactory          !NA     !NA      !NA
## 229    0.89    9.49   1.54  81 Satisfactory          !NA     !NA      !NA
## 230    0.54    3.09   0.11  86 Satisfactory          !NA     !NA      !NA
## 231    0.56    6.29   1.50  74 Satisfactory          !NA     !NA      !NA
## 232    0.58    5.08   0.63  74 Satisfactory          !NA     !NA      !NA
## 233    0.42    6.34   0.57  89 Satisfactory          !NA     !NA      !NA
## 234    1.46    7.39   0.24  90 Satisfactory          !NA     !NA      !NA
## 235    1.61    8.07   0.58  89 Satisfactory          !NA     !NA      !NA
## 236    0.43    4.77   0.72  76 Satisfactory          !NA     !NA      !NA
## 237    0.75   11.90   0.81 170     Moderate          !NA     !NA      !NA
## 238    2.43   13.59   0.91 143     Moderate          !NA     !NA      !NA
## 239    2.90   18.12   3.82 668       Severe          !NA     !NA      !NA
## 240    5.22   16.70   1.62 439       Severe          !NA     !NA      !NA
## 241    1.67   28.89   1.98 692       Severe          !NA     !NA      !NA
## 242    4.58   16.41   2.30 404       Severe          !NA     !NA      !NA
## 243    5.64   20.27   2.79 262         Poor          !NA     !NA      !NA
## 244    1.43    3.22   0.57 224         Poor          !NA     !NA      !NA
## 245    0.23    1.23   0.09  80 Satisfactory          !NA     !NA      !NA
## 246    1.33    4.66   0.10  93 Satisfactory          !NA     !NA      !NA
## 247    2.66   13.04   1.44  66 Satisfactory          !NA     !NA      !NA
## 248    0.66   11.26   0.63  62 Satisfactory          !NA     !NA      !NA
## 249    0.80    3.51   0.33 328    Very Poor          !NA     !NA      !NA
## 250    1.74    5.76   0.13 318    Very Poor          !NA     !NA      !NA
## 251    3.14   15.14   0.87 156     Moderate          !NA     !NA      !NA
## 252    0.64    3.74   0.35 152     Moderate          !NA     !NA      !NA
## 253    0.83   10.23   0.33 141     Moderate          !NA     !NA      !NA
## 254    2.54   12.27   0.51 174     Moderate          !NA     !NA      !NA
## 255    0.88    2.85   0.18 270         Poor          !NA     !NA      !NA
## 256    2.75   11.82   0.61 302    Very Poor          !NA     !NA      !NA
## 257    0.80    3.72   0.28 212         Poor          !NA     !NA      !NA
## 258    1.41    5.17   0.09 237         Poor          !NA     !NA      !NA
## 259    0.58    3.87   0.08 159     Moderate          !NA     !NA      !NA
## 260    0.94   23.62   0.66 237         Poor          !NA     !NA      !NA
## 261    3.62   13.90   0.83 248         Poor          !NA     !NA      !NA
## 262    1.14    5.77   0.61 282         Poor          !NA     !NA      !NA
## 263    2.60   19.11   0.22 280         Poor          !NA     !NA      !NA
## 264    1.00    8.24   0.63 284         Poor          !NA     !NA      !NA
## 265    3.21   13.55   0.37 269         Poor          !NA     !NA      !NA
## 266    1.27   11.09   0.58 204         Poor          !NA     !NA      !NA
## 267    3.86   17.94   0.53 238         Poor          !NA     !NA      !NA
##     PM10_NA NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA O3_NA Benzene_NA Toluene_NA
## 1       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 2       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 3       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 4       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 5       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 6       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 7       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 8       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 9       !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 10      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 11      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 12      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 13      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 14      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 15      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 16      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 17      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 18      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 19      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 20      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 21      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 22      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 23      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 24      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 25      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 26      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 27      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 28      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 29      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 30      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 31      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 32      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 33      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 34      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 35      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 36      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 37      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 38      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 39      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 40      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 41      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 42      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 43      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 44      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 45      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 46      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 47      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 48      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 49      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 50      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 51      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 52      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 53      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 54      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 55      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 56      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 57      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 58      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 59      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 60      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 61      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 62      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 63      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 64      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 65      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 66      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 67      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 68      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 69      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 70      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 71      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 72      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 73      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 74      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 75      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 76      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 77      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 78      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 79      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 80      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 81      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 82      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 83      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 84      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 85      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 86      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 87      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 88      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 89      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 90      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 91      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 92      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 93      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 94      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 95      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 96      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 97      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 98      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 99      !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 100     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 101     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 102     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 103     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 104     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 105     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 106     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 107     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 108     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 109     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 110     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 111     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 112     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 113     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 114     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 115     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 116     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 117     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 118     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 119     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 120     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 121     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 122     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 123     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 124     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 125     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 126     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 127     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 128     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 129     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 130     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 131     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 132     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 133     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 134     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 135     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 136     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 137     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 138     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 139     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 140     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 141     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 142     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 143     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 144     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 145     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 146     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 147     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 148     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 149     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 150     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 151     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 152     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 153     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 154     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 155     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 156     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 157     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 158     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 159     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 160     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 161     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 162     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 163     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 164     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 165     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 166     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 167     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 168     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 169     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 170     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 171     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 172     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 173     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 174     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 175     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 176     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 177     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 178     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 179     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 180     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 181     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 182     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 183     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 184     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 185     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 186     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 187     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 188     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 189     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 190     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 191     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 192     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 193     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 194     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 195     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 196     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 197     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 198     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 199     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 200     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 201     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 202     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 203     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 204     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 205     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 206     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 207     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 208     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 209     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 210     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 211     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 212     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 213     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 214     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 215     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 216     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 217     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 218     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 219     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 220     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 221     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 222     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 223     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 224     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 225     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 226     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 227     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 228     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 229     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 230     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 231     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 232     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 233     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 234     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 235     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 236     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 237     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 238     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 239     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 240     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 241     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 242     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 243     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 244     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 245     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 246     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 247     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 248     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 249     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 250     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 251     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 252     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 253     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 254     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 255     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 256     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 257     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 258     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 259     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 260     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 261     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 262     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 263     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 264     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 265     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 266     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
## 267     !NA   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA
##     Xylene_NA AQI_NA AQI_Bucket_NA Daily_Delay Month
## 1         !NA    !NA           !NA          42     6
## 2         !NA    !NA           !NA         190     6
## 3         !NA    !NA           !NA          66     6
## 4         !NA    !NA           !NA          30     6
## 5         !NA    !NA           !NA         120     7
## 6         !NA    !NA           !NA          31     7
## 7         !NA    !NA           !NA           8     7
## 8         !NA    !NA           !NA          40     7
## 9         !NA    !NA           !NA          58     7
## 10        !NA    !NA           !NA          32     7
## 11        !NA    !NA           !NA          43     7
## 12        !NA    !NA           !NA          48     7
## 13        !NA    !NA           !NA          31     7
## 14        !NA    !NA           !NA          40     7
## 15        !NA    !NA           !NA          59     7
## 16        !NA    !NA           !NA         140     7
## 17        !NA    !NA           !NA          45     7
## 18        !NA    !NA           !NA          62     7
## 19        !NA    !NA           !NA          50     7
## 20        !NA    !NA           !NA         100     7
## 21        !NA    !NA           !NA          61     7
## 22        !NA    !NA           !NA          50     7
## 23        !NA    !NA           !NA         109     7
## 24        !NA    !NA           !NA          42     7
## 25        !NA    !NA           !NA          33     7
## 26        !NA    !NA           !NA          40     8
## 27        !NA    !NA           !NA          37     8
## 28        !NA    !NA           !NA          62     8
## 29        !NA    !NA           !NA         105     8
## 30        !NA    !NA           !NA          35    11
## 31        !NA    !NA           !NA          23    11
## 32        !NA    !NA           !NA          58    11
## 33        !NA    !NA           !NA          58    11
## 34        !NA    !NA           !NA          60    11
## 35        !NA    !NA           !NA          60    11
## 36        !NA    !NA           !NA          46    11
## 37        !NA    !NA           !NA          46    11
## 38        !NA    !NA           !NA          63    12
## 39        !NA    !NA           !NA          63    12
## 40        !NA    !NA           !NA         106    12
## 41        !NA    !NA           !NA         106    12
## 42        !NA    !NA           !NA         145    12
## 43        !NA    !NA           !NA         145    12
## 44        !NA    !NA           !NA          34     1
## 45        !NA    !NA           !NA          34     1
## 46        !NA    !NA           !NA          31     1
## 47        !NA    !NA           !NA          31     1
## 48        !NA    !NA           !NA          36     1
## 49        !NA    !NA           !NA          36     1
## 50        !NA    !NA           !NA          66     1
## 51        !NA    !NA           !NA          66     1
## 52        !NA    !NA           !NA          45     1
## 53        !NA    !NA           !NA          45     1
## 54        !NA    !NA           !NA          45     1
## 55        !NA    !NA           !NA          45     1
## 56        !NA    !NA           !NA          56     1
## 57        !NA    !NA           !NA          56     1
## 58        !NA    !NA           !NA          52     2
## 59        !NA    !NA           !NA          52     2
## 60        !NA    !NA           !NA         189     2
## 61        !NA    !NA           !NA         189     2
## 62        !NA    !NA           !NA          43     2
## 63        !NA    !NA           !NA          43     2
## 64        !NA    !NA           !NA          63     2
## 65        !NA    !NA           !NA          63     2
## 66        !NA    !NA           !NA          98     2
## 67        !NA    !NA           !NA          98     2
## 68        !NA    !NA           !NA         207     2
## 69        !NA    !NA           !NA         207     2
## 70        !NA    !NA           !NA          67     2
## 71        !NA    !NA           !NA          67     2
## 72        !NA    !NA           !NA          43     2
## 73        !NA    !NA           !NA          43     2
## 74        !NA    !NA           !NA          34     2
## 75        !NA    !NA           !NA          34     2
## 76        !NA    !NA           !NA          70     2
## 77        !NA    !NA           !NA          70     2
## 78        !NA    !NA           !NA          29     2
## 79        !NA    !NA           !NA          29     2
## 80        !NA    !NA           !NA          54     2
## 81        !NA    !NA           !NA          54     2
## 82        !NA    !NA           !NA          65     2
## 83        !NA    !NA           !NA          65     2
## 84        !NA    !NA           !NA          43     2
## 85        !NA    !NA           !NA          43     2
## 86        !NA    !NA           !NA          34     2
## 87        !NA    !NA           !NA          57     3
## 88        !NA    !NA           !NA          57     3
## 89        !NA    !NA           !NA          57     3
## 90        !NA    !NA           !NA          57     3
## 91        !NA    !NA           !NA          33     3
## 92        !NA    !NA           !NA          33     3
## 93        !NA    !NA           !NA         184     3
## 94        !NA    !NA           !NA         184     3
## 95        !NA    !NA           !NA          27     3
## 96        !NA    !NA           !NA          27     3
## 97        !NA    !NA           !NA         104     3
## 98        !NA    !NA           !NA         104     3
## 99        !NA    !NA           !NA          33     3
## 100       !NA    !NA           !NA          33     3
## 101       !NA    !NA           !NA          56     3
## 102       !NA    !NA           !NA          41     4
## 103       !NA    !NA           !NA          62     4
## 104       !NA    !NA           !NA          38     4
## 105       !NA    !NA           !NA          27     4
## 106       !NA    !NA           !NA          40     4
## 107       !NA    !NA           !NA          40     4
## 108       !NA    !NA           !NA          93     4
## 109       !NA    !NA           !NA          68     4
## 110       !NA    !NA           !NA          52     4
## 111       !NA    !NA           !NA          64     4
## 112       !NA    !NA           !NA          51     5
## 113       !NA    !NA           !NA          72     5
## 114       !NA    !NA           !NA          29     5
## 115       !NA    !NA           !NA          24     5
## 116       !NA    !NA           !NA          41     5
## 117       !NA    !NA           !NA          32     5
## 118       !NA    !NA           !NA          34     5
## 119       !NA    !NA           !NA          90     5
## 120       !NA    !NA           !NA          45     5
## 121       !NA    !NA           !NA          22     5
## 122       !NA    !NA           !NA          41     5
## 123       !NA    !NA           !NA          87     6
## 124       !NA    !NA           !NA          63     6
## 125       !NA    !NA           !NA          76     6
## 126       !NA    !NA           !NA          80     6
## 127       !NA    !NA           !NA          82     6
## 128       !NA    !NA           !NA          56     7
## 129       !NA    !NA           !NA          50     7
## 130       !NA    !NA           !NA          50     7
## 131       !NA    !NA           !NA          47     7
## 132       !NA    !NA           !NA          47     7
## 133       !NA    !NA           !NA          55     7
## 134       !NA    !NA           !NA          55     7
## 135       !NA    !NA           !NA         170     7
## 136       !NA    !NA           !NA         170     7
## 137       !NA    !NA           !NA          46     7
## 138       !NA    !NA           !NA          46     7
## 139       !NA    !NA           !NA         159     7
## 140       !NA    !NA           !NA         159     7
## 141       !NA    !NA           !NA          97     7
## 142       !NA    !NA           !NA          97     7
## 143       !NA    !NA           !NA          56     7
## 144       !NA    !NA           !NA          56     7
## 145       !NA    !NA           !NA         313     7
## 146       !NA    !NA           !NA         313     7
## 147       !NA    !NA           !NA          32     7
## 148       !NA    !NA           !NA          32     7
## 149       !NA    !NA           !NA          65     7
## 150       !NA    !NA           !NA          65     7
## 151       !NA    !NA           !NA          44     7
## 152       !NA    !NA           !NA          44     7
## 153       !NA    !NA           !NA          25     7
## 154       !NA    !NA           !NA          25     7
## 155       !NA    !NA           !NA          29     7
## 156       !NA    !NA           !NA          29     7
## 157       !NA    !NA           !NA          78     7
## 158       !NA    !NA           !NA          78     7
## 159       !NA    !NA           !NA         126     7
## 160       !NA    !NA           !NA         126     7
## 161       !NA    !NA           !NA          41     7
## 162       !NA    !NA           !NA          41     7
## 163       !NA    !NA           !NA          68     7
## 164       !NA    !NA           !NA          68     7
## 165       !NA    !NA           !NA          42     7
## 166       !NA    !NA           !NA          42     7
## 167       !NA    !NA           !NA          61     8
## 168       !NA    !NA           !NA          61     8
## 169       !NA    !NA           !NA          45     8
## 170       !NA    !NA           !NA          45     8
## 171       !NA    !NA           !NA         113     8
## 172       !NA    !NA           !NA         113     8
## 173       !NA    !NA           !NA          53     8
## 174       !NA    !NA           !NA          53     8
## 175       !NA    !NA           !NA          48     8
## 176       !NA    !NA           !NA          48     8
## 177       !NA    !NA           !NA          85     8
## 178       !NA    !NA           !NA          85     8
## 179       !NA    !NA           !NA          38     8
## 180       !NA    !NA           !NA          38     8
## 181       !NA    !NA           !NA          87     8
## 182       !NA    !NA           !NA          87     8
## 183       !NA    !NA           !NA         231     8
## 184       !NA    !NA           !NA         231     8
## 185       !NA    !NA           !NA         110     8
## 186       !NA    !NA           !NA         110     8
## 187       !NA    !NA           !NA          38     8
## 188       !NA    !NA           !NA          38     8
## 189       !NA    !NA           !NA         214     8
## 190       !NA    !NA           !NA         214     8
## 191       !NA    !NA           !NA          54     8
## 192       !NA    !NA           !NA          54     8
## 193       !NA    !NA           !NA          91     8
## 194       !NA    !NA           !NA          91     8
## 195       !NA    !NA           !NA          60     8
## 196       !NA    !NA           !NA          60     8
## 197       !NA    !NA           !NA          45     8
## 198       !NA    !NA           !NA          45     8
## 199       !NA    !NA           !NA          59     8
## 200       !NA    !NA           !NA          59     8
## 201       !NA    !NA           !NA         111     8
## 202       !NA    !NA           !NA         111     8
## 203       !NA    !NA           !NA         129     8
## 204       !NA    !NA           !NA         129     8
## 205       !NA    !NA           !NA          59     9
## 206       !NA    !NA           !NA          59     9
## 207       !NA    !NA           !NA          15     9
## 208       !NA    !NA           !NA          15     9
## 209       !NA    !NA           !NA          73     9
## 210       !NA    !NA           !NA          73     9
## 211       !NA    !NA           !NA          43     9
## 212       !NA    !NA           !NA          43     9
## 213       !NA    !NA           !NA          41     9
## 214       !NA    !NA           !NA          41     9
## 215       !NA    !NA           !NA          50     9
## 216       !NA    !NA           !NA          50     9
## 217       !NA    !NA           !NA          74     9
## 218       !NA    !NA           !NA          74     9
## 219       !NA    !NA           !NA          50     9
## 220       !NA    !NA           !NA          50     9
## 221       !NA    !NA           !NA          83     9
## 222       !NA    !NA           !NA          83     9
## 223       !NA    !NA           !NA          36     9
## 224       !NA    !NA           !NA          36     9
## 225       !NA    !NA           !NA          35     9
## 226       !NA    !NA           !NA          35     9
## 227       !NA    !NA           !NA          72     9
## 228       !NA    !NA           !NA          72     9
## 229       !NA    !NA           !NA          56    10
## 230       !NA    !NA           !NA          56    10
## 231       !NA    !NA           !NA          45    10
## 232       !NA    !NA           !NA          66    10
## 233       !NA    !NA           !NA          90    10
## 234       !NA    !NA           !NA          90    10
## 235       !NA    !NA           !NA         205    10
## 236       !NA    !NA           !NA         205    10
## 237       !NA    !NA           !NA          72    10
## 238       !NA    !NA           !NA          72    10
## 239       !NA    !NA           !NA          65    11
## 240       !NA    !NA           !NA          65    11
## 241       !NA    !NA           !NA         279    11
## 242       !NA    !NA           !NA         279    11
## 243       !NA    !NA           !NA          69    11
## 244       !NA    !NA           !NA          69    11
## 245       !NA    !NA           !NA          50    11
## 246       !NA    !NA           !NA          50    11
## 247       !NA    !NA           !NA          74    11
## 248       !NA    !NA           !NA          74    11
## 249       !NA    !NA           !NA         420    12
## 250       !NA    !NA           !NA         420    12
## 251       !NA    !NA           !NA          98    12
## 252       !NA    !NA           !NA          98    12
## 253       !NA    !NA           !NA          42    12
## 254       !NA    !NA           !NA          42    12
## 255       !NA    !NA           !NA         157     1
## 256       !NA    !NA           !NA          57     1
## 257       !NA    !NA           !NA          57     1
## 258       !NA    !NA           !NA          20     1
## 259       !NA    !NA           !NA          20     1
## 260       !NA    !NA           !NA         116     1
## 261       !NA    !NA           !NA         116     1
## 262       !NA    !NA           !NA          26     1
## 263       !NA    !NA           !NA          26     1
## 264       !NA    !NA           !NA          48     1
## 265       !NA    !NA           !NA          48     1
## 266       !NA    !NA           !NA          45     1
## 267       !NA    !NA           !NA          45     1
# Preview the first six rows of the merged Delhi dataset.
head(Delhi_cohesive_dataset)
##         Date any_missing tavg tmin tmax prcp time_NA tavg_NA tmin_NA tmax_NA
## 1 2018-06-27 Not Missing 30.3 26.2 37.5  3.0     !NA     !NA     !NA     !NA
## 2 2018-06-28 Not Missing 29.9 24.2 37.5 20.1     !NA     !NA     !NA     !NA
## 3 2018-06-29 Not Missing 30.7 27.9 35.2  1.0     !NA     !NA     !NA     !NA
## 4 2018-06-30 Not Missing 31.3 27.5 35.6  9.9     !NA     !NA     !NA     !NA
## 5 2018-07-04 Not Missing 31.7 26.1 36.7  5.1     !NA     !NA     !NA     !NA
## 6 2018-07-06 Not Missing 32.9 28.1 37.3  5.1     !NA     !NA     !NA     !NA
##   prcp_NA StationId PM2.5   PM10    NO   NO2   NOx   NH3   CO   SO2    O3
## 1     !NA     DL019 48.03  89.10  4.09 39.86 24.30 18.91 0.68 12.71  9.14
## 2     !NA     DL019 23.98  38.46  3.64 34.88 21.51 26.11 0.52 11.41  6.42
## 3     !NA     DL019 34.77  60.62 11.30 53.24 37.53 38.76 0.75  9.87 11.59
## 4     !NA     DL019 42.65 113.91  5.90 50.46 31.40 21.05 0.91 12.43 10.90
## 5     !NA     DL019 44.09 138.82  2.30 34.02 19.90 27.99 0.56  7.27 13.37
## 6     !NA     DL019 48.80 110.93  8.11 29.32 22.19 35.32 0.53 10.14 15.38
##   Benzene Toluene Xylene AQI   AQI_Bucket StationId_NA Date_NA PM2.5_NA PM10_NA
## 1    1.74   11.65   1.60  80 Satisfactory          !NA     !NA      !NA     !NA
## 2    2.10    8.95   1.23  55 Satisfactory          !NA     !NA      !NA     !NA
## 3   10.66   11.07   4.41  59 Satisfactory          !NA     !NA      !NA     !NA
## 4    8.03   15.18   4.50  92 Satisfactory          !NA     !NA      !NA     !NA
## 5    3.03   10.98   1.84 152     Moderate          !NA     !NA      !NA     !NA
## 6    3.08   16.27   2.48 104     Moderate          !NA     !NA      !NA     !NA
##   NO_NA NO2_NA NOx_NA NH3_NA CO_NA SO2_NA O3_NA Benzene_NA Toluene_NA Xylene_NA
## 1   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA       !NA
## 2   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA       !NA
## 3   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA       !NA
## 4   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA       !NA
## 5   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA       !NA
## 6   !NA    !NA    !NA    !NA   !NA    !NA   !NA        !NA        !NA       !NA
##   AQI_NA AQI_Bucket_NA Daily_Delay Month
## 1    !NA           !NA          42     6
## 2    !NA           !NA         190     6
## 3    !NA           !NA          66     6
## 4    !NA           !NA          30     6
## 5    !NA           !NA         120     7
## 6    !NA           !NA          31     7
# The preview above shows no NA values, so the merge looks successful.
# Summarise every column of the full dataset.
summary(Delhi_cohesive_dataset)
##       Date            any_missing             tavg            tmin      
##  Min.   :2018-06-27   Length:267         Min.   :10.40   Min.   : 5.30  
##  1st Qu.:2019-02-08   Class :character   1st Qu.:18.30   1st Qu.:12.80  
##  Median :2019-07-07   Mode  :character   Median :28.20   Median :23.80  
##  Mean   :2019-05-16                      Mean   :25.29   Mean   :20.24  
##  3rd Qu.:2019-08-25                      3rd Qu.:30.90   3rd Qu.:26.20  
##  Max.   :2020-01-18                      Max.   :35.60   Max.   :28.80  
##       tmax            prcp        time_NA   tavg_NA   tmin_NA   tmax_NA  
##  Min.   :14.60   Min.   : 0.000   !NA:267   !NA:267   !NA:267   !NA:267  
##  1st Qu.:25.30   1st Qu.: 0.000   NA :  0   NA :  0   NA :  0   NA :  0  
##  Median :33.80   Median : 0.500                                          
##  Mean   :30.94   Mean   : 5.285                                          
##  3rd Qu.:36.00   3rd Qu.: 5.100                                          
##  Max.   :43.40   Max.   :70.100                                          
##  prcp_NA    StationId             PM2.5             PM10              NO       
##  !NA:267   Length:267         Min.   : 10.06   Min.   : 10.75   Min.   : 1.01  
##  NA :  0   Class :character   1st Qu.: 33.75   1st Qu.: 79.05   1st Qu.: 3.42  
##            Mode  :character   Median : 52.11   Median :113.91   Median : 7.12  
##                               Mean   : 69.21   Mean   :128.93   Mean   :11.75  
##                               3rd Qu.: 86.25   3rd Qu.:159.65   3rd Qu.:15.42  
##                               Max.   :734.56   Max.   :830.10   Max.   :67.89  
##       NO2              NOx             NH3              CO        
##  Min.   : 17.46   Min.   :10.37   Min.   : 6.79   Min.   :0.2800  
##  1st Qu.: 32.93   1st Qu.:22.20   1st Qu.:19.58   1st Qu.:0.6800  
##  Median : 41.93   Median :28.68   Median :24.64   Median :0.8100  
##  Mean   : 44.17   Mean   :32.92   Mean   :26.77   Mean   :0.8846  
##  3rd Qu.: 51.61   3rd Qu.:39.95   3rd Qu.:30.82   3rd Qu.:0.9900  
##  Max.   :112.54   Max.   :94.31   Max.   :72.95   Max.   :2.8000  
##       SO2               O3           Benzene          Toluene     
##  Min.   : 2.100   Min.   : 1.19   Min.   : 0.020   Min.   : 0.45  
##  1st Qu.: 5.065   1st Qu.:11.60   1st Qu.: 0.580   1st Qu.: 5.40  
##  Median : 9.930   Median :18.62   Median : 1.210   Median : 8.95  
##  Mean   :10.781   Mean   :19.66   Mean   : 1.667   Mean   :10.77  
##  3rd Qu.:12.450   3rd Qu.:24.61   3rd Qu.: 2.355   3rd Qu.:14.84  
##  Max.   :32.840   Max.   :74.89   Max.   :10.660   Max.   :63.70  
##      Xylene           AQI         AQI_Bucket        StationId_NA Date_NA  
##  Min.   :0.000   Min.   : 40.0   Length:267         !NA:267      !NA:267  
##  1st Qu.:0.180   1st Qu.: 86.5   Class :character   NA :  0      NA :  0  
##  Median :0.600   Median :119.0   Mode  :character                         
##  Mean   :1.113   Mean   :156.3                                            
##  3rd Qu.:1.600   3rd Qu.:202.0                                            
##  Max.   :9.880   Max.   :692.0                                            
##  PM2.5_NA  PM10_NA   NO_NA     NO2_NA    NOx_NA    NH3_NA    CO_NA    
##  !NA:267   !NA:267   !NA:267   !NA:267   !NA:267   !NA:267   !NA:267  
##  NA :  0   NA :  0   NA :  0   NA :  0   NA :  0   NA :  0   NA :  0  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##  SO2_NA    O3_NA     Benzene_NA Toluene_NA Xylene_NA AQI_NA    AQI_Bucket_NA
##  !NA:267   !NA:267   !NA:267    !NA:267    !NA:267   !NA:267   !NA:267      
##  NA :  0   NA :  0   NA :  0    NA :  0    NA :  0   NA :  0   NA :  0      
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Daily_Delay         Month       
##  Min.   :  8.00   Min.   : 1.000  
##  1st Qu.: 42.00   1st Qu.: 3.000  
##  Median : 56.00   Median : 7.000  
##  Mean   : 73.72   Mean   : 6.341  
##  3rd Qu.: 81.00   3rd Qu.: 8.000  
##  Max.   :420.00   Max.   :12.000

Analyse the cohesive dataset to understand how delay relates to the other parameters

# List the columns available for plotting and modelling.
names(Delhi_cohesive_dataset)
##  [1] "Date"          "any_missing"   "tavg"          "tmin"         
##  [5] "tmax"          "prcp"          "time_NA"       "tavg_NA"      
##  [9] "tmin_NA"       "tmax_NA"       "prcp_NA"       "StationId"    
## [13] "PM2.5"         "PM10"          "NO"            "NO2"          
## [17] "NOx"           "NH3"           "CO"            "SO2"          
## [21] "O3"            "Benzene"       "Toluene"       "Xylene"       
## [25] "AQI"           "AQI_Bucket"    "StationId_NA"  "Date_NA"      
## [29] "PM2.5_NA"      "PM10_NA"       "NO_NA"         "NO2_NA"       
## [33] "NOx_NA"        "NH3_NA"        "CO_NA"         "SO2_NA"       
## [37] "O3_NA"         "Benzene_NA"    "Toluene_NA"    "Xylene_NA"    
## [41] "AQI_NA"        "AQI_Bucket_NA" "Daily_Delay"   "Month"
# Daily delay against AQI: colour encodes the AQI bucket and point size
# encodes precipitation.
Delhi_cohesive_dataset %>%
  ggplot(
    mapping = aes(
      x = AQI,
      y = Daily_Delay,
      color = AQI_Bucket,
      size = prcp
    )
  ) +
  geom_point() +
  labs(title = "Impact of AQI and prcp")

## As per the plot, Good AQI is also observed for some delay cases, but these are few and far between and do not seem to have caused large delays.
## There are a huge number of delays for Satisfactory AQI cases, but most of them could be associated
## with pretty high precipitation.
## There are a good number of delays associated with Moderate cases too, and they have caused significant delays when combined with high precipitation.
## Delay instances reduce for Poor AQI cases, but there is a slight increase in the size of the delays.
## For Very Poor cases, the impact gets high when combined with precipitation.
## Severe cases are high-impact ones but do not appear to be affected by precipitation.


## Now let's view this purely from the weather perspective
# Re-list the column names as a reference for the weather variables.
names(Delhi_cohesive_dataset)
##  [1] "Date"          "any_missing"   "tavg"          "tmin"         
##  [5] "tmax"          "prcp"          "time_NA"       "tavg_NA"      
##  [9] "tmin_NA"       "tmax_NA"       "prcp_NA"       "StationId"    
## [13] "PM2.5"         "PM10"          "NO"            "NO2"          
## [17] "NOx"           "NH3"           "CO"            "SO2"          
## [21] "O3"            "Benzene"       "Toluene"       "Xylene"       
## [25] "AQI"           "AQI_Bucket"    "StationId_NA"  "Date_NA"      
## [29] "PM2.5_NA"      "PM10_NA"       "NO_NA"         "NO2_NA"       
## [33] "NOx_NA"        "NH3_NA"        "CO_NA"         "SO2_NA"       
## [37] "O3_NA"         "Benzene_NA"    "Toluene_NA"    "Xylene_NA"    
## [41] "AQI_NA"        "AQI_Bucket_NA" "Daily_Delay"   "Month"
# Daily delay against average temperature: colour encodes the minimum
# temperature and point size encodes precipitation.
Delhi_cohesive_dataset %>%
  ggplot(
    mapping = aes(
      x = tavg,
      y = Daily_Delay,
      color = tmin,
      size = prcp
    )
  ) +
  geom_point() +
  labs(title = "Impact of temp and prcp")

## It's clear that heavier precipitation brings in more instances of delays.
## But it's also interesting to find that higher tavg, higher precipitation and higher tmin bring
## in a lot of delays - though heavy precipitation does not always result in costly delays.

## Ok Lets also analyse if the components O3, PM2.5 and CO has impacts on delays

# One scatter plot of Daily_Delay per candidate predictor. In each plot the
# predictor is mapped to both the x axis and the point size. The remark under
# each plot refers to the plot directly above it.
ggplot(Delhi_cohesive_dataset, aes(x = O3, y = Daily_Delay, size = O3)) +
  geom_point() +
  labs(title = "Impact of O3")

## Looks like more O3 directly relates to higher delays

ggplot(Delhi_cohesive_dataset, aes(x = PM2.5, y = Daily_Delay, size = PM2.5)) +
  geom_point() +
  labs(title = "Impact of PM2.5")

## Looks like more PM2.5 might not have too much impact...

ggplot(Delhi_cohesive_dataset, aes(x = CO, y = Daily_Delay, size = CO)) +
  geom_point() +
  labs(title = "Impact of CO")

## Looks like size of CO has some correlation but may not be linear...

ggplot(Delhi_cohesive_dataset, aes(x = PM10, y = Daily_Delay, size = PM10)) +
  geom_point() +
  labs(title = "Impact of PM10")

## Looks like more PM10 might not have too much impact...

ggplot(Delhi_cohesive_dataset, aes(x = prcp, y = Daily_Delay, size = prcp)) +
  geom_point() +
  labs(title = "Impact of rain")

## Looks like amount of rain has direct impact on delays...


ggplot(Delhi_cohesive_dataset, aes(x = tavg, y = Daily_Delay, size = tavg)) +
  geom_point() +
  labs(title = "Impact of Average Temp")

## Looks like a lot of low intensity delays on higher average temperature...

ggplot(Delhi_cohesive_dataset, aes(x = tmin, y = Daily_Delay, size = tmin)) +
  geom_point() +
  labs(title = "Impact of Tmin")

## Looks like a lot of low intensity delays on higher Tmin...

ggplot(
  Delhi_cohesive_dataset,
  aes(x = AQI, y = Daily_Delay, size = AQI, color = AQI_Bucket)
) +
  geom_point() +
  labs(title = "Impact of AQI")

## Same AQI-vs-delay view as the first AQI plot, with point size showing AQI
## instead of precipitation.
## NOTE(review): the remark originally placed here described Tmin, not AQI -
## confirm the intended AQI takeaway against the rendered plot.

## Lets see if the months itself has any impact on the delay

ggplot(
  Delhi_cohesive_dataset,
  aes(x = Month, y = Daily_Delay, size = Daily_Delay)
) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "Impact of Month")

## Looks like there is high frequency of delays during monsoon and heavy delay during peak winter season


## Based on this, let's pick these elements to find the right model of what impacts Delhi air-traffic delays:
## Precipitation, AQI, tmin, O3 and CO
## (note: the models fitted below use tavg and Month rather than tmin)

## Let's see how each element individually relates to the traffic delay in a linear regression


## Fit the baseline AQI-only model, then five variants that each add one
## further predictor to AQI. Each fit is then passed to fmodel().
Delhi_Traffic_Delay_Model_AQI <- lm(
  formula = Daily_Delay ~ AQI,
  data = Delhi_cohesive_dataset
)
fmodel(Delhi_Traffic_Delay_Model_AQI)

Delhi_Traffic_Delay_Model_tavg <- lm(
  formula = Daily_Delay ~ AQI + tavg,
  data = Delhi_cohesive_dataset
)
fmodel(Delhi_Traffic_Delay_Model_tavg)

Delhi_Traffic_Delay_Model_prcp <- lm(
  formula = Daily_Delay ~ AQI + prcp,
  data = Delhi_cohesive_dataset
)
fmodel(Delhi_Traffic_Delay_Model_prcp)

Delhi_Traffic_Delay_Model_O3 <- lm(
  formula = Daily_Delay ~ AQI + O3,
  data = Delhi_cohesive_dataset
)
fmodel(Delhi_Traffic_Delay_Model_O3)

Delhi_Traffic_Delay_Model_CO <- lm(
  formula = Daily_Delay ~ AQI + CO,
  data = Delhi_cohesive_dataset
)
fmodel(Delhi_Traffic_Delay_Model_CO)

Delhi_Traffic_Delay_Model_Month <- lm(
  formula = Daily_Delay ~ AQI + Month,
  data = Delhi_cohesive_dataset
)
fmodel(Delhi_Traffic_Delay_Model_Month)

# Evaluate each fitted model with evaluate_model(). Where a second predictor
# is present it is fixed at the value given in the call. The printed output
# shows the model prediction at AQI = 0, 200 and 400.
evaluate_model(Delhi_Traffic_Delay_Model_AQI)
##   AQI model_output
## 1   0     65.89103
## 2 200     75.90179
## 3 400     85.91255
evaluate_model(Delhi_Traffic_Delay_Model_tavg, tavg = 35)
##   AQI tavg model_output
## 1   0   35     65.28536
## 2 200   35     74.13083
## 3 400   35     82.97629
evaluate_model(Delhi_Traffic_Delay_Model_prcp, prcp = 150)
##   AQI prcp model_output
## 1   0  150     207.8647
## 2 200  150     221.3815
## 3 400  150     234.8983
evaluate_model(Delhi_Traffic_Delay_Model_O3, O3 = 50)
##   AQI O3 model_output
## 1   0 50     75.87323
## 2 200 50     86.52229
## 3 400 50     97.17135
evaluate_model(Delhi_Traffic_Delay_Model_CO, CO = 1)
##   AQI CO model_output
## 1   0  1     66.60121
## 2 200  1     76.02051
## 3 400  1     85.43981
evaluate_model(Delhi_Traffic_Delay_Model_Month, Month = 12)
##   AQI Month model_output
## 1   0    12     84.65495
## 2 200    12    100.87275
## 3 400    12    117.09055
# Change in predicted delay when AQI rises from 200 to 400, for each model.
# These values used to be hard-coded literals (e.g. 118.1039 - 104.8699) that
# did not match the evaluate_model() output printed above. They are now
# derived from the fitted models so they cannot go stale.
# NOTE(review): assumes the old literals were each model's AQI = 400 and
# AQI = 200 predictions - confirm.
aqi_step_effect <- function(model, ...) {
  predictions <- evaluate_model(model, ...)
  at_400 <- predictions$model_output[predictions$AQI == 400]
  at_200 <- predictions$model_output[predictions$AQI == 200]
  # Fail loudly if evaluate_model() did not evaluate at exactly these levels.
  stopifnot(length(at_400) == 1L, length(at_200) == 1L)
  at_400 - at_200
}

diff_1 <- aqi_step_effect(Delhi_Traffic_Delay_Model_AQI)
diff_1 # approx. 10.01 (85.91255 - 75.90179 in the output above)
diff_2 <- aqi_step_effect(Delhi_Traffic_Delay_Model_tavg, tavg = 35)
diff_2 # approx. 8.85 (82.97629 - 74.13083)
diff_3 <- aqi_step_effect(Delhi_Traffic_Delay_Model_prcp, prcp = 150)
diff_3 # approx. 13.52 (234.8983 - 221.3815)
diff_4 <- aqi_step_effect(Delhi_Traffic_Delay_Model_O3, O3 = 50)
diff_4 # approx. 10.65 (97.17135 - 86.52229)
diff_5 <- aqi_step_effect(Delhi_Traffic_Delay_Model_CO, CO = 1)
diff_5 # approx. 9.42 (85.43981 - 76.02051)
diff_6 <- aqi_step_effect(Delhi_Traffic_Delay_Model_Month, Month = 12)
diff_6 # approx. 16.22 (117.09055 - 100.87275)
# Comparing the models on this measure, Month (~16.2) and prcp (~13.5) show
# the largest effect on the delay.
# NOTE(review): on the current output CO (~9.4) is below the AQI-only
# baseline (~10.0), although the earlier hard-coded figures ranked it highest.
# Confirm whether CO should stay in the models below.

## To evaluate the base model, split the data into test and train datasets

# Fix the RNG seed so the random split below is reproducible.
set.seed(1)

# Draw one TRUE/FALSE flag per row, TRUE with probability 0.7. Rows flagged
# TRUE form the training set and the rest form the test set. Because each row
# is drawn independently, the split is approximately (not exactly) 70/30.
sample_set <- sample(c(TRUE, FALSE), nrow(Delhi_cohesive_dataset), replace=TRUE, prob=c(0.7,0.3))
train_dataset  <- Delhi_cohesive_dataset[sample_set, ]
test_dataset   <- Delhi_cohesive_dataset[!sample_set, ]

# Base model: AQI, precipitation (prcp) and CO, fitted on the training set.
Base_Model_Delay <- lm(
  Daily_Delay ~ AQI + prcp + CO,
  data = train_dataset
)
# Augmented model: the base predictors plus Month.
Aug_Model_Delay <- lm(
  Daily_Delay ~ AQI + prcp + CO + Month,
  data = train_dataset
)
# Run cross-validation trials on the two models.
trials <- cv_pred_error(Base_Model_Delay, Aug_Model_Delay)


# Compare the two sets of cross-validated errors with a two-sample t-test of
# the mse column of `trials`, grouped by model.
t.test(mse ~ model, data = trials)
## 
##  Welch Two Sample t-test
## 
## data:  mse by model
## t = -3.7181, df = 5.5958, p-value = 0.0112
## alternative hypothesis: true difference in means between group Aug_Model_Delay and group Base_Model_Delay is not equal to 0
## 95 percent confidence interval:
##  -219.25895  -43.35727
## sample estimates:
##  mean in group Aug_Model_Delay mean in group Base_Model_Delay 
##                       3093.531                       3224.839
# The t-statistic is -3.7181 with 5.5958 degrees of freedom (df). These are used with a t-distribution to derive the p-value of 0.0112.

# p-value = 0.0112 - i.e., given that there is no actual/true difference in means, if we repeated the experiment over and over again, about 1.1% of the time we would see a difference in means as large as in our samples, or a more extreme one. Since the p-value is lower than 0.05, the difference is significant.
# So we can reject the null hypothesis (H0) of no difference between the (true) averages of the two groups
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
# -219.25895 -43.35727
# If H0 is false, the true difference in mean MSE (augmented minus base) plausibly lies in that interval; the sample means are 3093.531 (Aug_Model_Delay) and 3224.839 (Base_Model_Delay).
# The augmented model has the lower cross-validated error, so we will choose it - i.e., Daily_Delay ~ AQI+prcp+CO+Month

Model for predicting Delhi air traffic delays

## For our model to predict the air traffic delays:
## Response Variable is Daily_Delay
## Explanatory Variables are Precipitation (prcp), AQI, CO and Month

## We are choosing a linear regression model here because this is about predicting the numerical values
## and does not belong to classification modelling
# Fit the chosen (augmented) linear model on the training set and print its
# coefficient table and fit statistics.
Delhi_Traffic_Delay_Model <- lm(
  formula = Daily_Delay ~ AQI + prcp + CO + Month,
  data = train_dataset
)
summary(Delhi_Traffic_Delay_Model)
## 
## Call:
## lm(formula = Daily_Delay ~ AQI + prcp + CO + Month, data = train_dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.524 -32.566 -13.216   9.351 280.643 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  39.7776    14.0751   2.826  0.00523 **
## AQI           0.1055     0.0416   2.536  0.01203 * 
## prcp          0.9591     0.4172   2.299  0.02263 * 
## CO          -17.1722    13.0220  -1.319  0.18891   
## Month         4.0846     1.2420   3.289  0.00121 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.49 on 184 degrees of freedom
## Multiple R-squared:  0.09215,    Adjusted R-squared:  0.07241 
## F-statistic: 4.669 on 4 and 184 DF,  p-value: 0.001301
# Predict the delay for every row of the held-out test set and print the
# predictions.
Predicted_Traffic_Delay <- predict(
  object = Delhi_Traffic_Delay_Model,
  newdata = test_dataset
)
print(Predicted_Traffic_Delay)
##         4         6         7        15        17        18        20        21 
##  67.86179  75.13417  69.56807  88.21506  78.60849 131.77652  71.05416 107.11761 
##        29        35        37        39        41        43        46        49 
##  74.06803  72.26148  78.91427 111.01858 106.23819  92.53629  67.35221  59.99422 
##        52        61        68        70        72        76        77        79 
##  62.48193  64.49781  52.87972  64.13694  73.84210  60.09088  48.85431  66.46546 
##        80        82        85        87        94        95        96        99 
##  48.73530  50.33918  45.13479  52.97105  54.68656  59.41816  53.72444  55.23866 
##       104       109       111       112       117       121       125       135 
##  73.44033  56.54381  54.40547  72.88220  63.95472  67.15259  72.10652  69.19774 
##       139       145       148       150       162       164       165       169 
##  74.85769  53.49240 101.39792  48.88850  68.52069  63.48317  62.02874  89.37714 
##       172       173       176       178       180       183       185       187 
##  68.33729  64.47273  67.76635  63.30805  63.98892  80.54250  66.41104  73.81277 
##       188       189       191       194       198       200       210       211 
##  72.46788 109.24905  67.07992  65.66837  65.05209  68.90869  79.90820  76.39041 
##       213       214       215       218       219       225       230       243 
##  72.94546  73.86186  72.29595  72.40341 103.99104  69.99095  76.81900  93.29322 
##       250       251       252       260       264       265 
## 133.32210  85.98529  89.34110  51.35490  56.67005  54.05690
# Store the predictions in the test set as a new Predicted_Delay column.
test_dataset["Predicted_Delay"] <- Predicted_Traffic_Delay

# Pair each observed delay with its prediction, keyed by year and month.
# reframe() replaces the multi-row summarise() call, which dplyr 1.1.0
# deprecated. It yields the same rows in the same group order, but returns an
# ungrouped tibble, so no grouping leaks into later steps.
Summary_Model_Performace <- test_dataset %>%
  group_by(YEAR = year(ymd(Date)), Month) %>%
  reframe(Daily_Delay, Predicted_Delay)
Summary_Model_Performace
## # A tibble: 78 × 4
##     YEAR Month Daily_Delay Predicted_Delay
##    <dbl> <dbl>       <dbl>           <dbl>
##  1  2018     6          30            67.9
##  2  2018     7          31            75.1
##  3  2018     7           8            69.6
##  4  2018     7          59            88.2
##  5  2018     7          45            78.6
##  6  2018     7          62           132. 
##  7  2018     7         100            71.1
##  8  2018     7          61           107. 
##  9  2018     8         105            74.1
## 10  2018    11          60            72.3
## # ℹ 68 more rows
# Observed vs predicted delay per month, one panel per year. The colour
# legend distinguishes the two series (the misspelt label
# 'Predictede_Delay' is corrected to 'Predicted_Delay').
ggplot(Summary_Model_Performace, aes(x = Month)) +
  geom_point(aes(y = Daily_Delay, color = "Daily_Delay")) +
  geom_point(aes(y = Predicted_Delay, color = "Predicted_Delay")) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "Model Performance") +
  facet_wrap(~YEAR)

# As we can see, the model performs reasonably for some months but misses the extreme
# delays (multiple R-squared is only about 0.09). So the model needs further fine-tuning, or the dataset needs to be re-analysed.